From a51f8d66a7d61c672e7cb04bb4b31f80c4c1db1d Mon Sep 17 00:00:00 2001 From: Maneesh P M Date: Tue, 12 Oct 2021 13:44:47 +0530 Subject: [PATCH 1/3] Introduce an LRU Cache and concurrent cache The cache is copied from the libzim project: https://github.com/openzim/libzim The exact files have been copied from commit 27f5e70 --- src/tools/concurrent_cache.h | 95 +++++++++++++++++++++ src/tools/lrucache.h | 160 +++++++++++++++++++++++++++++++++++ 2 files changed, 255 insertions(+) create mode 100644 src/tools/concurrent_cache.h create mode 100644 src/tools/lrucache.h diff --git a/src/tools/concurrent_cache.h b/src/tools/concurrent_cache.h new file mode 100644 index 000000000..1b5175b2e --- /dev/null +++ b/src/tools/concurrent_cache.h @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2021 Matthieu Gautier + * Copyright (C) 2020 Veloman Yunkan + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_CONCURRENT_CACHE_H +#define ZIM_CONCURRENT_CACHE_H + +#include "lrucache.h" + +#include +#include + +namespace kiwix +{ + +/** + ConcurrentCache implements a concurrent thread-safe cache + + Compared to kiwix::lru_cache, each access operation is slightly more expensive. + However, different slots of the cache can be safely accessed concurrently + with minimal blocking. 
Concurrent access to the same element is also + safe, and, in case of a cache miss, will block until that element becomes + available. + */ +template +class ConcurrentCache +{ +private: // types + typedef std::shared_future ValuePlaceholder; + typedef lru_cache Impl; + +public: // functions + explicit ConcurrentCache(size_t maxEntries) + : impl_(maxEntries) + {} + + // Gets the entry corresponding to the given key. If the entry is not in the + // cache, it is obtained by calling f() (without any arguments) and the + // result is put into the cache. + // + // The cache as a whole is locked only for the duration of accessing + // the respective slot. If, in the case of a cache miss, the generation + // of the missing element takes a long time, only attempts to access that + // element will block - the rest of the cache remains open to concurrent + // access. If f() throws a std::exception, the entry is dropped again and the exception is rethrown. NOTE(review): exceptions of other types are not caught here, so the entry is not dropped for them - confirm this is intended. + template + Value getOrPut(const Key& key, F f) + { + std::promise valuePromise; + std::unique_lock l(lock_); + const auto x = impl_.getOrPut(key, valuePromise.get_future().share()); + l.unlock(); + if ( x.miss() ) { + try { + valuePromise.set_value(f()); + } catch (std::exception& e) { + drop(key); + throw; + } + } + + return x.value().get(); + } + + bool drop(const Key& key) + { + std::unique_lock l(lock_); + return impl_.drop(key); + } + +private: // data + Impl impl_; + std::mutex lock_; +}; + +} // namespace kiwix + +#endif // ZIM_CONCURRENT_CACHE_H + diff --git a/src/tools/lrucache.h b/src/tools/lrucache.h new file mode 100644 index 000000000..bd90c3128 --- /dev/null +++ b/src/tools/lrucache.h @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2021, Matthieu Gautier + * Copyright (c) 2020, Veloman Yunkan + * Copyright (c) 2014, lamerman + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of lamerman nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * File: lrucache.hpp + * Author: Alexander Ponomarev + * + * Created on June 20, 2013, 5:09 PM + */ + +#ifndef _LRUCACHE_HPP_INCLUDED_ +#define _LRUCACHE_HPP_INCLUDED_ + +#include +#include +#include +#include +#include + +namespace kiwix { + +template +class lru_cache { +public: // types + typedef typename std::pair key_value_pair_t; + typedef typename std::list::iterator list_iterator_t; + + enum AccessStatus { + HIT, // key was found in the cache + PUT, // key was not in the cache but was created by the getOrPut() access + MISS // key was not in the cache; get() access failed + }; + + class AccessResult + { + const AccessStatus status_; + const value_t val_; + public: + AccessResult(const value_t& val, AccessStatus status) + : status_(status), val_(val) + {} + AccessResult() : status_(MISS), val_() {} + + bool hit() const { return status_ == HIT; } + bool miss() const { return !hit(); } + const value_t& value() const + { + if ( status_ == MISS ) + throw std::range_error("There is no such key in cache"); + return val_; + } + + operator const value_t& () const { return value(); } + }; + +public: // functions + explicit lru_cache(size_t max_size) : + _max_size(max_size) { + } + + // If 'key' is present in the cache, returns the associated value, + // otherwise puts the given value into the cache (and returns it with + // a status of PUT, which miss() also reports as a cache miss). 
+ AccessResult getOrPut(const key_t& key, const value_t& value) { + auto it = _cache_items_map.find(key); + if (it != _cache_items_map.end()) { + _cache_items_list.splice(_cache_items_list.begin(), _cache_items_list, it->second); + return AccessResult(it->second->second, HIT); + } else { + putMissing(key, value); + return AccessResult(value, PUT); + } + } + + void put(const key_t& key, const value_t& value) { + auto it = _cache_items_map.find(key); + if (it != _cache_items_map.end()) { + _cache_items_list.splice(_cache_items_list.begin(), _cache_items_list, it->second); + it->second->second = value; + } else { + putMissing(key, value); + } + } + + AccessResult get(const key_t& key) { + auto it = _cache_items_map.find(key); + if (it == _cache_items_map.end()) { + return AccessResult(); + } else { + _cache_items_list.splice(_cache_items_list.begin(), _cache_items_list, it->second); + return AccessResult(it->second->second, HIT); + } + } + + bool drop(const key_t& key) { + try { + auto list_it = _cache_items_map.at(key); + _cache_items_list.erase(list_it); + _cache_items_map.erase(key); + return true; + } catch (std::out_of_range& e) { + return false; + } + } + + bool exists(const key_t& key) const { + return _cache_items_map.find(key) != _cache_items_map.end(); + } + + size_t size() const { + return _cache_items_map.size(); + } + +private: // functions (putMissing() inserts a new entry at the front of the list and evicts the entry at the back once the size exceeds _max_size) + void putMissing(const key_t& key, const value_t& value) { + assert(_cache_items_map.find(key) == _cache_items_map.end()); + _cache_items_list.push_front(key_value_pair_t(key, value)); + _cache_items_map[key] = _cache_items_list.begin(); + if (_cache_items_map.size() > _max_size) { + _cache_items_map.erase(_cache_items_list.back().first); + _cache_items_list.pop_back(); + } + } + +private: // data (the list keeps the most recently accessed entry first; the map points each key at its list node) + std::list _cache_items_list; + std::map _cache_items_map; + size_t _max_size; +}; + +} // namespace kiwix + +#endif /* _LRUCACHE_HPP_INCLUDED_ */ From 7cb4c1361fad174c0aa44a81c8b138096224233a Mon Sep 17 00:00:00 2001 From: 
Maneesh P M Date: Tue, 12 Oct 2021 13:45:18 +0530 Subject: [PATCH 2/3] Retrieve Searcher and Search from LRU Cache We use the new cache template to implement two kinds of cache. 1: The Searcher cache is more general in terms of its usage. A Searcher can be used for multiple searches without much change to itself. We try to retrieve the searcher and perform searches using it whenever possible, and if not we put a searcher into the cache. User can specify a custom cache length by manipulating the environment variable SEARCHER_CACHE_SIZE. Its default value is 10% of all the books available. 2: The search cache is much more restricted in terms of usage. Its main purpose is to avoid re-searching on the searcher during page changes to generate SearchResultSet of various ranges. User can specify a custom cache length using the environment variable SEARCH_CACHE_SIZE with a default value of 2. --- src/server/internalServer.cpp | 38 ++++++++++++++++++++++++++--------- src/server/internalServer.h | 10 +++++++++ 2 files changed, 38 insertions(+), 10 deletions(-) diff --git a/src/server/internalServer.cpp b/src/server/internalServer.cpp index df17033c1..e2ead7bce 100644 --- a/src/server/internalServer.cpp +++ b/src/server/internalServer.cpp @@ -58,8 +58,6 @@ extern "C" { #include #include -#include -#include #include #include @@ -80,6 +78,7 @@ extern "C" { #define MAX_SEARCH_LEN 140 #define KIWIX_MIN_CONTENT_SIZE_TO_DEFLATE 100 +#define DEFAULT_CACHE_SIZE 2 namespace kiwix { @@ -96,6 +95,18 @@ inline std::string normalizeRootUrl(std::string rootUrl) return rootUrl.empty() ? rootUrl : "/" + rootUrl; } +// Returns the value of env var `name` if found, otherwise returns defaultVal +unsigned int getCacheLength(const char* name, unsigned int defaultVal) { + try { + const char* envString = std::getenv(name); + if (envString == nullptr) { + throw std::runtime_error("Environment variable not set"); + } + return extractFromString(envString); + } catch (...) 
{} + + return defaultVal; +} } // unnamed namespace static IdNameMapper defaultNameMapper; @@ -134,7 +145,9 @@ InternalServer::InternalServer(Library* library, m_ipConnectionLimit(ipConnectionLimit), mp_daemon(nullptr), mp_library(library), - mp_nameMapper(nameMapper ? nameMapper : &defaultNameMapper) + mp_nameMapper(nameMapper ? nameMapper : &defaultNameMapper), + searcherCache(getCacheLength("SEARCHER_CACHE_SIZE", std::max((unsigned int) (mp_library->getBookCount(true, true)*0.1), 1U))), + searchCache(getCacheLength("SEARCH_CACHE_SIZE", DEFAULT_CACHE_SIZE)) {} bool InternalServer::start() { @@ -488,11 +501,11 @@ std::unique_ptr InternalServer::handle_search(const RequestContext& re } catch(const std::out_of_range&) {} catch(const std::invalid_argument&) {} - std::string bookName; + std::string bookName, bookId; std::shared_ptr archive; try { bookName = request.get_argument("content"); - const std::string bookId = mp_nameMapper->getIdForName(bookName); + bookId = mp_nameMapper->getIdForName(bookName); archive = mp_library->getArchiveById(bookId); } catch (const std::out_of_range&) {} @@ -509,7 +522,7 @@ std::unique_ptr InternalServer::handle_search(const RequestContext& re std::shared_ptr searcher; if (archive) { - searcher = std::make_shared(*archive); + searcher = searcherCache.getOrPut(bookId, [=](){ return std::make_shared(*archive);}); } else { for (auto& bookId: mp_library->filter(kiwix::Filter().local(true).valid(true))) { auto currentArchive = mp_library->getArchiveById(bookId); @@ -540,6 +553,7 @@ std::unique_ptr InternalServer::handle_search(const RequestContext& re } /* Get the results */ + std::string queryString; try { zim::Query query; if (patternString.empty()) { @@ -549,6 +563,7 @@ std::unique_ptr InternalServer::handle_search(const RequestContext& re } query.setQuery(""); + queryString = "GEO:" + to_string(latitude) + to_string(longitude) + to_string(distance); query.setGeorange(latitude, longitude, distance); } else { // Execute Ft search @@ 
-556,13 +571,16 @@ std::unique_ptr InternalServer::handle_search(const RequestContext& re cout << "Performing query `" << patternString << "'" << endl; } - std::string queryString = removeAccents(patternString); + queryString = "FT:" + removeAccents(patternString); query.setQuery(queryString); } + queryString = bookId + queryString; - zim::Search search = searcher->search(query); - SearchRenderer renderer(search.getResults(start, pageLength), mp_nameMapper, mp_library, start, - search.getEstimatedMatches()); + std::shared_ptr search; + search = searchCache.getOrPut(queryString, [=](){ return make_shared(searcher->search(query));}); + + SearchRenderer renderer(search->getResults(start, pageLength), mp_nameMapper, mp_library, start, + search->getEstimatedMatches()); renderer.setSearchPattern(patternString); renderer.setSearchContent(bookName); renderer.setProtocolPrefix(m_root + "/"); diff --git a/src/server/internalServer.h b/src/server/internalServer.h index f4ae08eea..e808f8ee9 100644 --- a/src/server/internalServer.h +++ b/src/server/internalServer.h @@ -28,6 +28,9 @@ extern "C" { #include "library.h" #include "name_mapper.h" +#include +#include + #include #include @@ -36,9 +39,13 @@ extern "C" { #include "server/request_context.h" #include "server/response.h" +#include "tools/concurrent_cache.h" + namespace kiwix { typedef kainjow::mustache::data MustacheData; +typedef ConcurrentCache> SearcherCache; +typedef ConcurrentCache> SearchCache; class Entry; class OPDSDumper; @@ -115,6 +122,9 @@ class InternalServer { Library* mp_library; NameMapper* mp_nameMapper; + SearcherCache searcherCache; + SearchCache searchCache; + std::string m_server_id; std::string m_library_id; From 6523d9f563e416b6983f25689b0404ed18079053 Mon Sep 17 00:00:00 2001 From: Maneesh P M Date: Wed, 13 Oct 2021 20:52:02 +0530 Subject: [PATCH 3/3] Retrieve SuggestionSearcher from LRU Cache We create a cache for SuggestionSearcher very similar to that of FT searcher. 
User can specify a custom cache size using the environment variable SUGGESTION_SEARCHER_CACHE_SIZE. It has a default value of 10% of the number of books in the library. --- src/server/internalServer.cpp | 23 +++++++++++++---------- src/server/internalServer.h | 2 ++ 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/server/internalServer.cpp b/src/server/internalServer.cpp index e2ead7bce..328c7636c 100644 --- a/src/server/internalServer.cpp +++ b/src/server/internalServer.cpp @@ -147,7 +147,8 @@ InternalServer::InternalServer(Library* library, mp_library(library), mp_nameMapper(nameMapper ? nameMapper : &defaultNameMapper), searcherCache(getCacheLength("SEARCHER_CACHE_SIZE", std::max((unsigned int) (mp_library->getBookCount(true, true)*0.1), 1U))), - searchCache(getCacheLength("SEARCH_CACHE_SIZE", DEFAULT_CACHE_SIZE)) + searchCache(getCacheLength("SEARCH_CACHE_SIZE", DEFAULT_CACHE_SIZE)), + suggestionSearcherCache(getCacheLength("SUGGESTION_SEARCHER_CACHE_SIZE", std::max((unsigned int) (mp_library->getBookCount(true, true)*0.1), 1U))) {} bool InternalServer::start() { @@ -352,14 +353,15 @@ std::unique_ptr InternalServer::build_homepage(const RequestContext& r * Archive and Zim handlers begin **/ -// TODO: retrieve searcher from caching mechanism -SuggestionsList_t getSuggestions(const zim::Archive* const archive, - const std::string& queryString, int start, int suggestionCount) +SuggestionsList_t getSuggestions(SuggestionSearcherCache& cache, const zim::Archive* const archive, + const std::string& bookId, const std::string& queryString, int start, int suggestionCount) { SuggestionsList_t suggestions; - auto searcher = zim::SuggestionSearcher(*archive); + std::shared_ptr searcher; + searcher = cache.getOrPut(bookId, [=](){ return make_shared(*archive); }); + if (archive->hasTitleIndex()) { - auto search = searcher.suggest(queryString); + auto search = searcher->suggest(queryString); auto srs = search.getResults(start, suggestionCount); for (auto it 
: srs) { @@ -372,7 +374,7 @@ SuggestionsList_t getSuggestions(const zim::Archive* const archive, std::vector variants = getTitleVariants(queryString); int currCount = 0; for (auto it = variants.begin(); it != variants.end() && currCount < suggestionCount; it++) { - auto search = searcher.suggest(queryString); + auto search = searcher->suggest(queryString); auto srs = search.getResults(0, suggestionCount); for (auto it : srs) { SuggestionItem suggestion(it.getTitle(), kiwix::normalize(it.getTitle()), @@ -392,11 +394,11 @@ std::unique_ptr InternalServer::handle_suggest(const RequestContext& r printf("** running handle_suggest\n"); } - std::string bookName; + std::string bookName, bookId; std::shared_ptr archive; try { bookName = request.get_argument("content"); - const std::string bookId = mp_nameMapper->getIdForName(bookName); + bookId = mp_nameMapper->getIdForName(bookName); archive = mp_library->getArchiveById(bookId); } catch (const std::out_of_range&) { // error handled by the archive == nullptr check below @@ -423,7 +425,8 @@ std::unique_ptr InternalServer::handle_suggest(const RequestContext& r bool first = true; /* Get the suggestions */ - SuggestionsList_t suggestions = getSuggestions(archive.get(), queryString, start, count); + SuggestionsList_t suggestions = getSuggestions(suggestionSearcherCache, archive.get(), + bookId, queryString, start, count); for(auto& suggestion:suggestions) { MustacheData result; result.set("label", suggestion.getTitle()); diff --git a/src/server/internalServer.h b/src/server/internalServer.h index e808f8ee9..6ccde9f8f 100644 --- a/src/server/internalServer.h +++ b/src/server/internalServer.h @@ -46,6 +46,7 @@ namespace kiwix { typedef kainjow::mustache::data MustacheData; typedef ConcurrentCache> SearcherCache; typedef ConcurrentCache> SearchCache; +typedef ConcurrentCache> SuggestionSearcherCache; class Entry; class OPDSDumper; @@ -124,6 +125,7 @@ class InternalServer { SearcherCache searcherCache; SearchCache searchCache; + 
SuggestionSearcherCache suggestionSearcherCache; std::string m_server_id; std::string m_library_id;