mirror of https://github.com/kiwix/libkiwix.git
Merge pull request #620 from kiwix/search_caching
This commit is contained in:
commit
e48b550b68
|
@ -58,8 +58,6 @@ extern "C" {
|
|||
|
||||
#include <zim/uuid.h>
|
||||
#include <zim/error.h>
|
||||
#include <zim/search.h>
|
||||
#include <zim/suggestion.h>
|
||||
#include <zim/entry.h>
|
||||
#include <zim/item.h>
|
||||
|
||||
|
@ -80,6 +78,7 @@ extern "C" {
|
|||
|
||||
#define MAX_SEARCH_LEN 140
|
||||
#define KIWIX_MIN_CONTENT_SIZE_TO_DEFLATE 100
|
||||
#define DEFAULT_CACHE_SIZE 2
|
||||
|
||||
namespace kiwix {
|
||||
|
||||
|
@ -96,6 +95,18 @@ inline std::string normalizeRootUrl(std::string rootUrl)
|
|||
return rootUrl.empty() ? rootUrl : "/" + rootUrl;
|
||||
}
|
||||
|
||||
// Returns the value of env var `name` if found, otherwise returns defaultVal
|
||||
unsigned int getCacheLength(const char* name, unsigned int defaultVal) {
|
||||
try {
|
||||
const char* envString = std::getenv(name);
|
||||
if (envString == nullptr) {
|
||||
throw std::runtime_error("Environment variable not set");
|
||||
}
|
||||
return extractFromString<unsigned int>(envString);
|
||||
} catch (...) {}
|
||||
|
||||
return defaultVal;
|
||||
}
|
||||
} // unnamed namespace
|
||||
|
||||
static IdNameMapper defaultNameMapper;
|
||||
|
@ -134,7 +145,10 @@ InternalServer::InternalServer(Library* library,
|
|||
m_ipConnectionLimit(ipConnectionLimit),
|
||||
mp_daemon(nullptr),
|
||||
mp_library(library),
|
||||
mp_nameMapper(nameMapper ? nameMapper : &defaultNameMapper)
|
||||
mp_nameMapper(nameMapper ? nameMapper : &defaultNameMapper),
|
||||
searcherCache(getCacheLength("SEARCHER_CACHE_SIZE", std::max((unsigned int) (mp_library->getBookCount(true, true)*0.1), 1U))),
|
||||
searchCache(getCacheLength("SEARCH_CACHE_SIZE", DEFAULT_CACHE_SIZE)),
|
||||
suggestionSearcherCache(getCacheLength("SUGGESTION_SEARCHER_CACHE_SIZE", std::max((unsigned int) (mp_library->getBookCount(true, true)*0.1), 1U)))
|
||||
{}
|
||||
|
||||
bool InternalServer::start() {
|
||||
|
@ -339,14 +353,15 @@ std::unique_ptr<Response> InternalServer::build_homepage(const RequestContext& r
|
|||
* Archive and Zim handlers begin
|
||||
**/
|
||||
|
||||
// TODO: retrieve searcher from caching mechanism
|
||||
SuggestionsList_t getSuggestions(const zim::Archive* const archive,
|
||||
const std::string& queryString, int start, int suggestionCount)
|
||||
SuggestionsList_t getSuggestions(SuggestionSearcherCache& cache, const zim::Archive* const archive,
|
||||
const std::string& bookId, const std::string& queryString, int start, int suggestionCount)
|
||||
{
|
||||
SuggestionsList_t suggestions;
|
||||
auto searcher = zim::SuggestionSearcher(*archive);
|
||||
std::shared_ptr<zim::SuggestionSearcher> searcher;
|
||||
searcher = cache.getOrPut(bookId, [=](){ return make_shared<zim::SuggestionSearcher>(*archive); });
|
||||
|
||||
if (archive->hasTitleIndex()) {
|
||||
auto search = searcher.suggest(queryString);
|
||||
auto search = searcher->suggest(queryString);
|
||||
auto srs = search.getResults(start, suggestionCount);
|
||||
|
||||
for (auto it : srs) {
|
||||
|
@ -359,7 +374,7 @@ SuggestionsList_t getSuggestions(const zim::Archive* const archive,
|
|||
std::vector<std::string> variants = getTitleVariants(queryString);
|
||||
int currCount = 0;
|
||||
for (auto it = variants.begin(); it != variants.end() && currCount < suggestionCount; it++) {
|
||||
auto search = searcher.suggest(queryString);
|
||||
auto search = searcher->suggest(queryString);
|
||||
auto srs = search.getResults(0, suggestionCount);
|
||||
for (auto it : srs) {
|
||||
SuggestionItem suggestion(it.getTitle(), kiwix::normalize(it.getTitle()),
|
||||
|
@ -379,11 +394,11 @@ std::unique_ptr<Response> InternalServer::handle_suggest(const RequestContext& r
|
|||
printf("** running handle_suggest\n");
|
||||
}
|
||||
|
||||
std::string bookName;
|
||||
std::string bookName, bookId;
|
||||
std::shared_ptr<zim::Archive> archive;
|
||||
try {
|
||||
bookName = request.get_argument("content");
|
||||
const std::string bookId = mp_nameMapper->getIdForName(bookName);
|
||||
bookId = mp_nameMapper->getIdForName(bookName);
|
||||
archive = mp_library->getArchiveById(bookId);
|
||||
} catch (const std::out_of_range&) {
|
||||
// error handled by the archive == nullptr check below
|
||||
|
@ -410,7 +425,8 @@ std::unique_ptr<Response> InternalServer::handle_suggest(const RequestContext& r
|
|||
bool first = true;
|
||||
|
||||
/* Get the suggestions */
|
||||
SuggestionsList_t suggestions = getSuggestions(archive.get(), queryString, start, count);
|
||||
SuggestionsList_t suggestions = getSuggestions(suggestionSearcherCache, archive.get(),
|
||||
bookId, queryString, start, count);
|
||||
for(auto& suggestion:suggestions) {
|
||||
MustacheData result;
|
||||
result.set("label", suggestion.getTitle());
|
||||
|
@ -488,11 +504,11 @@ std::unique_ptr<Response> InternalServer::handle_search(const RequestContext& re
|
|||
} catch(const std::out_of_range&) {}
|
||||
catch(const std::invalid_argument&) {}
|
||||
|
||||
std::string bookName;
|
||||
std::string bookName, bookId;
|
||||
std::shared_ptr<zim::Archive> archive;
|
||||
try {
|
||||
bookName = request.get_argument("content");
|
||||
const std::string bookId = mp_nameMapper->getIdForName(bookName);
|
||||
bookId = mp_nameMapper->getIdForName(bookName);
|
||||
archive = mp_library->getArchiveById(bookId);
|
||||
} catch (const std::out_of_range&) {}
|
||||
|
||||
|
@ -509,7 +525,7 @@ std::unique_ptr<Response> InternalServer::handle_search(const RequestContext& re
|
|||
|
||||
std::shared_ptr<zim::Searcher> searcher;
|
||||
if (archive) {
|
||||
searcher = std::make_shared<zim::Searcher>(*archive);
|
||||
searcher = searcherCache.getOrPut(bookId, [=](){ return std::make_shared<zim::Searcher>(*archive);});
|
||||
} else {
|
||||
for (auto& bookId: mp_library->filter(kiwix::Filter().local(true).valid(true))) {
|
||||
auto currentArchive = mp_library->getArchiveById(bookId);
|
||||
|
@ -540,6 +556,7 @@ std::unique_ptr<Response> InternalServer::handle_search(const RequestContext& re
|
|||
}
|
||||
|
||||
/* Get the results */
|
||||
std::string queryString;
|
||||
try {
|
||||
zim::Query query;
|
||||
if (patternString.empty()) {
|
||||
|
@ -549,6 +566,7 @@ std::unique_ptr<Response> InternalServer::handle_search(const RequestContext& re
|
|||
}
|
||||
|
||||
query.setQuery("");
|
||||
queryString = "GEO:" + to_string(latitude) + to_string(longitude) + to_string(distance);
|
||||
query.setGeorange(latitude, longitude, distance);
|
||||
} else {
|
||||
// Execute Ft search
|
||||
|
@ -556,13 +574,16 @@ std::unique_ptr<Response> InternalServer::handle_search(const RequestContext& re
|
|||
cout << "Performing query `" << patternString << "'" << endl;
|
||||
}
|
||||
|
||||
std::string queryString = removeAccents(patternString);
|
||||
queryString = "FT:" + removeAccents(patternString);
|
||||
query.setQuery(queryString);
|
||||
}
|
||||
queryString = bookId + queryString;
|
||||
|
||||
zim::Search search = searcher->search(query);
|
||||
SearchRenderer renderer(search.getResults(start, pageLength), mp_nameMapper, mp_library, start,
|
||||
search.getEstimatedMatches());
|
||||
std::shared_ptr<zim::Search> search;
|
||||
search = searchCache.getOrPut(queryString, [=](){ return make_shared<zim::Search>(searcher->search(query));});
|
||||
|
||||
SearchRenderer renderer(search->getResults(start, pageLength), mp_nameMapper, mp_library, start,
|
||||
search->getEstimatedMatches());
|
||||
renderer.setSearchPattern(patternString);
|
||||
renderer.setSearchContent(bookName);
|
||||
renderer.setProtocolPrefix(m_root + "/");
|
||||
|
|
|
@ -28,6 +28,9 @@ extern "C" {
|
|||
#include "library.h"
|
||||
#include "name_mapper.h"
|
||||
|
||||
#include <zim/search.h>
|
||||
#include <zim/suggestion.h>
|
||||
|
||||
#include <mustache.hpp>
|
||||
|
||||
#include <atomic>
|
||||
|
@ -36,9 +39,14 @@ extern "C" {
|
|||
#include "server/request_context.h"
|
||||
#include "server/response.h"
|
||||
|
||||
#include "tools/concurrent_cache.h"
|
||||
|
||||
namespace kiwix {
|
||||
|
||||
typedef kainjow::mustache::data MustacheData;
|
||||
typedef ConcurrentCache<string, std::shared_ptr<zim::Searcher>> SearcherCache;
|
||||
typedef ConcurrentCache<string, std::shared_ptr<zim::Search>> SearchCache;
|
||||
typedef ConcurrentCache<string, std::shared_ptr<zim::SuggestionSearcher>> SuggestionSearcherCache;
|
||||
|
||||
class Entry;
|
||||
class OPDSDumper;
|
||||
|
@ -115,6 +123,10 @@ class InternalServer {
|
|||
Library* mp_library;
|
||||
NameMapper* mp_nameMapper;
|
||||
|
||||
SearcherCache searcherCache;
|
||||
SearchCache searchCache;
|
||||
SuggestionSearcherCache suggestionSearcherCache;
|
||||
|
||||
std::string m_server_id;
|
||||
std::string m_library_id;
|
||||
|
||||
|
|
|
@ -0,0 +1,95 @@
|
|||
/*
|
||||
* Copyright (C) 2021 Matthieu Gautier <mgautier@kymeria.fr>
|
||||
* Copyright (C) 2020 Veloman Yunkan
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License as
|
||||
* published by the Free Software Foundation; either version 2 of the
|
||||
* License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
|
||||
* warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
|
||||
* NON-INFRINGEMENT. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef ZIM_CONCURRENT_CACHE_H
|
||||
#define ZIM_CONCURRENT_CACHE_H
|
||||
|
||||
#include "lrucache.h"
|
||||
|
||||
#include <future>
|
||||
#include <mutex>
|
||||
|
||||
namespace kiwix
|
||||
{
|
||||
|
||||
/**
|
||||
ConcurrentCache implements a concurrent thread-safe cache
|
||||
|
||||
Compared to kiwix::lru_cache, each access operation is slightly more expensive.
|
||||
However, different slots of the cache can be safely accessed concurrently
|
||||
with minimal blocking. Concurrent access to the same element is also
|
||||
safe, and, in case of a cache miss, will block until that element becomes
|
||||
available.
|
||||
*/
|
||||
template <typename Key, typename Value>
|
||||
class ConcurrentCache
|
||||
{
|
||||
private: // types
|
||||
typedef std::shared_future<Value> ValuePlaceholder;
|
||||
typedef lru_cache<Key, ValuePlaceholder> Impl;
|
||||
|
||||
public: // types
|
||||
explicit ConcurrentCache(size_t maxEntries)
|
||||
: impl_(maxEntries)
|
||||
{}
|
||||
|
||||
// Gets the entry corresponding to the given key. If the entry is not in the
|
||||
// cache, it is obtained by calling f() (without any arguments) and the
|
||||
// result is put into the cache.
|
||||
//
|
||||
// The cache as a whole is locked only for the duration of accessing
|
||||
// the respective slot. If, in the case of the a cache miss, the generation
|
||||
// of the missing element takes a long time, only attempts to access that
|
||||
// element will block - the rest of the cache remains open to concurrent
|
||||
// access.
|
||||
template<class F>
|
||||
Value getOrPut(const Key& key, F f)
|
||||
{
|
||||
std::promise<Value> valuePromise;
|
||||
std::unique_lock<std::mutex> l(lock_);
|
||||
const auto x = impl_.getOrPut(key, valuePromise.get_future().share());
|
||||
l.unlock();
|
||||
if ( x.miss() ) {
|
||||
try {
|
||||
valuePromise.set_value(f());
|
||||
} catch (std::exception& e) {
|
||||
drop(key);
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
return x.value().get();
|
||||
}
|
||||
|
||||
bool drop(const Key& key)
|
||||
{
|
||||
std::unique_lock<std::mutex> l(lock_);
|
||||
return impl_.drop(key);
|
||||
}
|
||||
|
||||
private: // data
|
||||
Impl impl_;
|
||||
std::mutex lock_;
|
||||
};
|
||||
|
||||
} // namespace kiwix
|
||||
|
||||
#endif // ZIM_CONCURRENT_CACHE_H
|
||||
|
|
@ -0,0 +1,160 @@
|
|||
/*
|
||||
 * Copyright (c) 2021, Matthieu Gautier <mgautier@kymeria.fr>
|
||||
* Copyright (c) 2020, Veloman Yunkan
|
||||
* Copyright (c) 2014, lamerman
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the name of lamerman nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* File: lrucache.hpp
|
||||
* Author: Alexander Ponomarev
|
||||
*
|
||||
* Created on June 20, 2013, 5:09 PM
|
||||
*/
|
||||
|
||||
#ifndef _LRUCACHE_HPP_INCLUDED_
|
||||
#define _LRUCACHE_HPP_INCLUDED_
|
||||
|
||||
#include <map>
|
||||
#include <list>
|
||||
#include <cstddef>
|
||||
#include <stdexcept>
|
||||
#include <cassert>
|
||||
|
||||
namespace kiwix {
|
||||
|
||||
// A fixed-capacity key/value cache with least-recently-used eviction.
//
// Entries are kept in a list ordered from most to least recently used and
// are indexed by a map from key to list position. Inserting a new key when
// the cache already holds its maximum number of entries evicts the entry at
// the tail of the list.
template<typename key_t, typename value_t>
class lru_cache {
public: // types
  using key_value_pair_t = std::pair<key_t, value_t>;
  using list_iterator_t = typename std::list<key_value_pair_t>::iterator;

  enum AccessStatus {
    HIT,  // the key was already present
    PUT,  // the key was absent and has been inserted by getOrPut()
    MISS  // the key was absent and get() could not return anything
  };

  // Outcome of a cache access: a status plus (unless the status is MISS)
  // a copy of the associated value.
  class AccessResult
  {
    const AccessStatus status_;
    const value_t val_;
  public:
    AccessResult(const value_t& val, AccessStatus status)
      : status_(status), val_(val)
    {}
    AccessResult() : status_(MISS), val_() {}

    bool hit() const { return status_ == HIT; }
    // Note: PUT also counts as a miss.
    bool miss() const { return status_ != HIT; }

    // Returns the value; throws std::range_error for a MISS result.
    const value_t& value() const
    {
      if ( status_ != MISS )
        return val_;
      throw std::range_error("There is no such key in cache");
    }

    operator const value_t& () const { return value(); }
  };

public: // functions
  explicit lru_cache(size_t max_size)
    : capacity_(max_size)
  {}

  // Looks up 'key'. When present, the entry becomes the most recently used
  // one and its value is returned with status HIT. Otherwise 'value' is
  // inserted and returned with status PUT.
  AccessResult getOrPut(const key_t& key, const value_t& value) {
    const auto found = index_.find(key);
    if (found == index_.end()) {
      insertNew(key, value);
      return AccessResult(value, PUT);
    }
    promote(found->second);
    return AccessResult(found->second->second, HIT);
  }

  // Inserts 'value' under 'key', overwriting any existing value. Either way
  // the entry ends up as the most recently used one.
  void put(const key_t& key, const value_t& value) {
    const auto found = index_.find(key);
    if (found == index_.end()) {
      insertNew(key, value);
      return;
    }
    promote(found->second);
    found->second->second = value;
  }

  // Looks up 'key' without inserting. A found entry becomes the most
  // recently used one; an absent key yields a MISS result.
  AccessResult get(const key_t& key) {
    const auto found = index_.find(key);
    if (found != index_.end()) {
      promote(found->second);
      return AccessResult(found->second->second, HIT);
    }
    return AccessResult();
  }

  // Removes 'key' from the cache. Returns true if it was present.
  bool drop(const key_t& key) {
    const auto found = index_.find(key);
    if (found == index_.end()) {
      return false;
    }
    entries_.erase(found->second);
    index_.erase(found);
    return true;
  }

  bool exists(const key_t& key) const {
    return index_.count(key) != 0;
  }

  size_t size() const {
    return index_.size();
  }

private: // functions
  // Moves the given entry to the front (most recently used position).
  // splice() relinks the node, so the iterator stored in the index stays valid.
  void promote(list_iterator_t entry) {
    entries_.splice(entries_.begin(), entries_, entry);
  }

  // Adds a key known to be absent, then evicts the least recently used
  // entry if the capacity has been exceeded.
  void insertNew(const key_t& key, const value_t& value) {
    assert(index_.find(key) == index_.end());
    entries_.push_front(key_value_pair_t(key, value));
    index_[key] = entries_.begin();
    if (index_.size() > capacity_) {
      index_.erase(entries_.back().first);
      entries_.pop_back();
    }
  }

private: // data
  std::list<key_value_pair_t> entries_;       // most recently used first
  std::map<key_t, list_iterator_t> index_;    // key -> position in entries_
  size_t capacity_;
};
|
||||
|
||||
} // namespace kiwix
|
||||
|
||||
#endif /* _LRUCACHE_HPP_INCLUDED_ */
|
Loading…
Reference in New Issue