catching up with master

This commit is contained in:
renaud gaudin 2013-12-09 12:02:58 +00:00
commit fd9c9ac17e
7 changed files with 225 additions and 140 deletions

View File

@ -67,6 +67,7 @@ namespace kiwix {
string publisher;
string date;
string url;
string origId;
string articleCount;
string mediaCount;
bool readOnly;

View File

@ -56,6 +56,7 @@ namespace kiwix {
book.creator = bookNode.attribute("creator").value();
book.publisher = bookNode.attribute("publisher").value();
book.url = bookNode.attribute("url").value();
book.origId = bookNode.attribute("origId").value();
book.articleCount = bookNode.attribute("articleCount").value();
book.mediaCount = bookNode.attribute("mediaCount").value();
book.size = bookNode.attribute("size").value();
@ -154,41 +155,46 @@ namespace kiwix {
bookNode.append_attribute("indexType") = "xapian";
}
if (!itr->title.empty())
bookNode.append_attribute("title") = itr->title.c_str();
if (itr->origId.empty()) {
if (!itr->title.empty())
bookNode.append_attribute("title") = itr->title.c_str();
if (!itr->description.empty())
bookNode.append_attribute("description") = itr->description.c_str();
if (!itr->language.empty())
bookNode.append_attribute("language") = itr->language.c_str();
if (!itr->creator.empty())
bookNode.append_attribute("creator") = itr->creator.c_str();
if (!itr->publisher.empty())
bookNode.append_attribute("publisher") = itr->publisher.c_str();
if (!itr->favicon.empty())
bookNode.append_attribute("favicon") = itr->favicon.c_str();
if (!itr->faviconMimeType.empty())
bookNode.append_attribute("faviconMimeType") = itr->faviconMimeType.c_str();
}
if (itr->description != "")
bookNode.append_attribute("description") = itr->description.c_str();
if (itr->language != "")
bookNode.append_attribute("language") = itr->language.c_str();
if (itr->date != "")
if (!itr->date.empty())
bookNode.append_attribute("date") = itr->date.c_str();
if (itr->creator != "")
bookNode.append_attribute("creator") = itr->creator.c_str();
if (itr->publisher != "")
bookNode.append_attribute("publisher") = itr->publisher.c_str();
if (itr->url != "")
if (!itr->url.empty())
bookNode.append_attribute("url") = itr->url.c_str();
if (itr->articleCount != "")
if (!itr->origId.empty())
bookNode.append_attribute("origId") = itr->origId.c_str();
if (!itr->articleCount.empty())
bookNode.append_attribute("articleCount") = itr->articleCount.c_str();
if (itr->mediaCount != "")
if (!itr->mediaCount.empty())
bookNode.append_attribute("mediaCount") = itr->mediaCount.c_str();
if (itr->size != "")
if (!itr->size.empty())
bookNode.append_attribute("size") = itr->size.c_str();
if (itr->favicon != "")
bookNode.append_attribute("favicon") = itr->favicon.c_str();
if (itr->faviconMimeType != "")
bookNode.append_attribute("faviconMimeType") = itr->faviconMimeType.c_str();
}
}
@ -256,7 +262,7 @@ namespace kiwix {
book->creator = reader->getCreator();
book->publisher = reader->getPublisher();
book->title = reader->getTitle();
book->origId = reader->getOrigId();
std::ostringstream articleCountStream;
articleCountStream << reader->getArticleCount();
book->articleCount = articleCountStream.str();
@ -307,10 +313,12 @@ namespace kiwix {
std::map<string, bool> booksLanguagesMap;
std::sort(library.books.begin(), library.books.end(), kiwix::Book::sortByLanguage);
for ( itr = library.books.begin(); itr != library.books.end(); ++itr ) {
for (itr = library.books.begin(); itr != library.books.end(); ++itr) {
if (booksLanguagesMap.find(itr->language) == booksLanguagesMap.end()) {
booksLanguagesMap[itr->language] = true;
booksLanguages.push_back(itr->language);
if (itr->origId.empty()) {
booksLanguagesMap[itr->language] = true;
booksLanguages.push_back(itr->language);
}
}
}
@ -323,10 +331,12 @@ namespace kiwix {
std::map<string, bool> booksCreatorsMap;
std::sort(library.books.begin(), library.books.end(), kiwix::Book::sortByCreator);
for ( itr = library.books.begin(); itr != library.books.end(); ++itr ) {
for (itr = library.books.begin(); itr != library.books.end(); ++itr) {
if (booksCreatorsMap.find(itr->creator) == booksCreatorsMap.end()) {
booksCreatorsMap[itr->creator] = true;
booksCreators.push_back(itr->creator);
if (itr->origId.empty()) {
booksCreatorsMap[itr->creator] = true;
booksCreators.push_back(itr->creator);
}
}
}
@ -353,8 +363,10 @@ namespace kiwix {
std::sort(library.books.begin(), library.books.end(), kiwix::Book::sortByPublisher);
for ( itr = library.books.begin(); itr != library.books.end(); ++itr ) {
if (booksPublishersMap.find(itr->publisher) == booksPublishersMap.end()) {
booksPublishersMap[itr->publisher] = true;
booksPublishers.push_back(itr->publisher);
if (itr->origId.empty()) {
booksPublishersMap[itr->publisher] = true;
booksPublishers.push_back(itr->publisher);
}
}
}

View File

@ -40,7 +40,7 @@ namespace kiwix {
enum supportedListSortBy { TITLE, SIZE, DATE, CREATOR, PUBLISHER };
class Manager {
public:
Manager();
~Manager();
@ -55,9 +55,9 @@ namespace kiwix {
string getCurrentBookId();
bool setBookIndex(const string id, const string path, const supportedIndexType type);
bool setBookPath(const string id, const string path);
string addBookFromPathAndGetId(const string pathToOpen, const string pathToSave = "", const string url = "",
string addBookFromPathAndGetId(const string pathToOpen, const string pathToSave = "", const string url = "",
const bool checkMetaData = false);
bool addBookFromPath(const string pathToOpen, const string pathToSave = "", const string url = "",
bool addBookFromPath(const string pathToOpen, const string pathToSave = "", const string url = "",
const bool checkMetaData = false);
Library cloneLibrary();
bool getBookById(const string id, Book &book);
@ -65,7 +65,7 @@ namespace kiwix {
unsigned int getBookCount(const bool localBooks, const bool remoteBooks);
bool updateBookLastOpenDateById(const string id);
void removeBookPaths();
bool listBooks(const supportedListMode mode, const supportedListSortBy sortBy, const unsigned int maxSize,
bool listBooks(const supportedListMode mode, const supportedListSortBy sortBy, const unsigned int maxSize,
const string language, const string creator, const string publisher, const string search);
vector<string> getBooksLanguages();
vector<string> getBooksCreators();
@ -75,10 +75,10 @@ namespace kiwix {
string writableLibraryPath;
vector<std::string> bookIdList;
protected:
kiwix::Library library;
bool readBookFromPath(const string path, Book *book = NULL);
bool parseXmlDom(const pugi::xml_document &doc, const bool readOnly, const string libraryPath);

View File

@ -19,6 +19,38 @@
#include "reader.h"
/* Return the lowercase hex digit for the high nibble (bits 4-7) of v */
inline char hi(char v) {
  static const char hex[] = "0123456789abcdef";
  return hex[(static_cast<unsigned char>(v) >> 4) & 0xf];
}

/* Return the lowercase hex digit for the low nibble (bits 0-3) of v */
inline char lo(char v) {
  static const char hex[] = "0123456789abcdef";
  return hex[static_cast<unsigned char>(v) & 0xf];
}

/*
 * Format the first 16 bytes of "in" as a lowercase, dash separated UUID
 * string: 4-2-2-2-6 bytes, i.e. 8-4-4-4-12 hex digits.
 *
 * "in" holds raw bytes (it may contain '\0'). If it is shorter than 16
 * bytes, the missing bytes are treated as 0 instead of being read past
 * the end of the string. Bytes after the 16th are ignored.
 */
std::string hexUUID (std::string in) {
  /* Number of raw bytes in a UUID */
  static const std::string::size_type uuidSize = 16;

  /* Pad short input so that every in[n] below is a valid access */
  if (in.size() < uuidSize)
    in.resize(uuidSize, '\0');

  std::string out;
  out.reserve(2 * uuidSize + 4);

  for (std::string::size_type n = 0; n < uuidSize; ++n) {
    /* A dash precedes the groups starting at bytes 4, 6, 8 and 10 */
    if (n == 4 || n == 6 || n == 8 || n == 10)
      out += '-';
    out += hi(in[n]);
    out += lo(in[n]);
  }

  return out;
}
static char charFromHex(std::string a) {
std::istringstream Blat (a);
int Z;
@ -28,9 +60,10 @@ static char charFromHex(std::string a) {
void unescapeUrl(string &url) {
std::string::size_type pos = 0;
while ((pos = url.find('%', pos + 1)) != std::string::npos &&
pos + 3 <= url.length()) {
while ((pos = url.find('%', pos)) != std::string::npos &&
pos + 2 < url.length()) {
url.replace(pos, 3, 1, charFromHex(url.substr(pos + 1, 2)));
++pos;
}
return;
}
@ -38,14 +71,14 @@ void unescapeUrl(string &url) {
namespace kiwix {
/* Constructor */
Reader::Reader(const string zimFilePath)
Reader::Reader(const string zimFilePath)
: zimFileHandler(NULL) {
string tmpZimFilePath = zimFilePath;
/* Remove potential trailing zimaa */
size_t found = tmpZimFilePath.rfind("zimaa");
if (found != string::npos &&
tmpZimFilePath.size() > 5 &&
if (found != string::npos &&
tmpZimFilePath.size() > 5 &&
found == tmpZimFilePath.size() - 5) {
tmpZimFilePath.resize(tmpZimFilePath.size() - 2);
}
@ -63,7 +96,7 @@ namespace kiwix {
/* initialize random seed: */
srand ( time(NULL) );
}
/* Destructor */
Reader::~Reader() {
if (this->zimFileHandler != NULL) {
@ -74,7 +107,7 @@ namespace kiwix {
zim::File* Reader::getZimFileHandler() {
return this->zimFileHandler;
}
/* Reset the cursor for GetNextArticle() */
void Reader::reset() {
this->currentArticleOffset = this->firstArticleOffset;
@ -101,12 +134,12 @@ namespace kiwix {
return counters;
}
/* Get the count of articles which can be indexed/displayed */
unsigned int Reader::getArticleCount() {
std::map<std::string, unsigned int> counterMap = this->parseCounterMetadata();
unsigned int counter = 0;
if (counterMap.empty()) {
counter = this->nsACount;
} else {
@ -114,7 +147,7 @@ namespace kiwix {
if (it != counterMap.end())
counter = it->second;
}
return counter;
}
@ -140,10 +173,10 @@ namespace kiwix {
if (it != counterMap.end())
counter += it->second;
}
return counter;
}
/* Get the total of all items of a ZIM file, redirects included */
unsigned int Reader::getGlobalCount() {
return this->zimFileHandler->getCountArticles();
@ -155,7 +188,7 @@ namespace kiwix {
s << this->zimFileHandler->getFileheader().getUuid();
return s.str();
}
/* Return a page url from a title */
bool Reader::getPageUrlFromTitle(const string &title, string &url) {
/* Extract the content from the zim file */
@ -163,7 +196,7 @@ namespace kiwix {
/* Test if the article was found */
if (resultPair.first == true) {
/* Get the article */
zim::Article article = *resultPair.second;
@ -172,7 +205,7 @@ namespace kiwix {
while (article.isRedirect() && loopCounter++<42) {
article = article.getRedirectArticle();
}
url = article.getLongUrl();
return true;
}
@ -182,53 +215,53 @@ namespace kiwix {
/* Return the URL of a randomly chosen page */
string Reader::getRandomPageUrl() {
zim::size_type idx = this->firstArticleOffset +
(zim::size_type)((double)rand() / ((double)RAND_MAX + 1) * this->nsACount);
zim::size_type idx = this->firstArticleOffset +
(zim::size_type)((double)rand() / ((double)RAND_MAX + 1) * this->nsACount);
zim::Article article = zimFileHandler->getArticle(idx);
return article.getLongUrl().c_str();
}
/* Return the welcome page URL */
string Reader::getMainPageUrl() {
string url = "";
if (this->zimFileHandler->getFileheader().hasMainPage()) {
zim::Article article = zimFileHandler->getArticle(this->zimFileHandler->getFileheader().getMainPage());
url = article.getLongUrl();
if (url.empty()) {
url = getFirstPageUrl();
url = getFirstPageUrl();
}
} else {
url = getFirstPageUrl();
url = getFirstPageUrl();
}
return url;
}
bool Reader::getFavicon(string &content, string &mimeType) {
unsigned int contentLength = 0;
this->getContentByUrl( "/-/favicon.png", content,
this->getContentByUrl( "/-/favicon.png", content,
contentLength, mimeType);
if (content.empty()) {
this->getContentByUrl( "/I/favicon.png", content,
this->getContentByUrl( "/I/favicon.png", content,
contentLength, mimeType);
if (content.empty()) {
this->getContentByUrl( "/I/favicon", content,
this->getContentByUrl( "/I/favicon", content,
contentLength, mimeType);
if (content.empty()) {
this->getContentByUrl( "/-/favicon", content,
this->getContentByUrl( "/-/favicon", content,
contentLength, mimeType);
}
}
}
return content.empty() ? false : true;
}
@ -236,11 +269,11 @@ namespace kiwix {
bool Reader::getMetatag(const string &name, string &value) {
unsigned int contentLength = 0;
string contentType = "";
return this->getContentByUrl( "/M/" + name, value,
return this->getContentByUrl( "/M/" + name, value,
contentLength, contentType);
}
string Reader::getTitle() {
string value;
this->getMetatag("Title", value);
@ -256,7 +289,7 @@ namespace kiwix {
string Reader::getDescription() {
string value;
this->getMetatag("Description", value);
/* Mediawiki Collection tends to use the "Subtitle" name */
if (value.empty()) {
this->getMetatag("Subtitle", value);
@ -289,34 +322,61 @@ namespace kiwix {
return value;
}
/*
 * Return the identifier stored in the "startfileuid" metadata entry,
 * formatted as a UUID string by hexUUID(). Returns an empty string if
 * the entry is missing or empty.
 *
 * The entry is parsed as a list of decimal byte values, each one
 * terminated by a newline ('\n'). Text after the last newline is
 * ignored, and at most 16 values are read; missing values count as 0.
 */
string Reader::getOrigId() {
  string value;
  this->getMetatag("startfileuid", value);
  if (value.empty())
    return "";

  /* Raw UUID bytes, zero filled */
  char uuidBytes[16] = "";
  unsigned int byteCount = 0;
  std::string number;

  /* Stop once the buffer is full so that extra values cannot overflow it */
  for (std::string::size_type i = 0;
       i < value.size() && byteCount < sizeof(uuidBytes);
       ++i) {
    if (value[i] == '\n') {
      uuidBytes[byteCount] = static_cast<char>(atoi(number.c_str()));
      ++byteCount;
      number.clear();
    } else {
      number += value[i];
    }
  }

  /* Pass the length explicitly: the bytes may contain 0 and the array
     is not guaranteed to be NUL terminated */
  return hexUUID(std::string(uuidBytes, sizeof(uuidBytes)));
}
/* Return the long URL of the article located at the beginning of the 'A' namespace */
string Reader::getFirstPageUrl() {
  const zim::size_type namespaceStart = zimFileHandler->getNamespaceBeginOffset('A');
  zim::Article firstArticle = zimFileHandler->getArticle(namespaceStart);
  return firstArticle.getLongUrl();
}
bool Reader::parseUrl(const string &url, char *ns, string &title) {
/* Offset to visit the url */
unsigned int urlLength = url.size();
unsigned int offset = 0;
/* Ignore the '/' */
while ((offset < urlLength) && (url[offset] == '/')) offset++;
/* Get namespace */
while ((offset < urlLength) && (url[offset] != '/')) {
*ns= url[offset];
offset++;
}
/* Ignore the '/' */
while ((offset < urlLength) && (url[offset] == '/')) offset++;
while ((offset < urlLength) && (url[offset] == '/')) offset++;
/* Get content title */
unsigned int titleOffset = offset;
while (offset < urlLength) {
@ -338,7 +398,7 @@ namespace kiwix {
contentLength = 0;
if (this->zimFileHandler != NULL) {
/* Parse the url */
char ns = 0;
string titleStr;
@ -348,68 +408,72 @@ namespace kiwix {
if (titleStr.empty() && ns == 0) {
this->parseUrl(this->getMainPageUrl(), &ns, titleStr);
}
/* Extract the content from the zim file */
std::pair<bool, zim::File::const_iterator> resultPair = zimFileHandler->findx(ns, titleStr);
/* Test if the article was found */
if (resultPair.first == true) {
/* Get the article */
zim::Article article = zimFileHandler->getArticle(resultPair.second.getIndex());
/* If redirect */
unsigned int loopCounter = 0;
while (article.isRedirect() && loopCounter++<42) {
article = article.getRedirectArticle();
}
/* Get the content mime-type */
contentType = string(article.getMimeType().data(), article.getMimeType().size());
contentType = string(article.getMimeType().data(), article.getMimeType().size());
/* Get the data */
content = string(article.getData().data(), article.getArticleSize());
/* Try to set a stub HTML header/footer if necessary */
if (contentType == "text/html" && std::string::npos == content.find("<body>")) {
content = "<html><head><title>" + article.getTitle() + "</title><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" /></head><body>" + content + "</body></html>";
}
/* Get the data length */
contentLength = article.getArticleSize();
/* Set return value */
retVal = true;
}
}
return retVal;
}
/* Search titles by prefix */
bool Reader::searchSuggestions(const string &prefix, unsigned int suggestionsCount, const bool reset) {
bool retVal = false;
zim::File::const_iterator articleItr;
zim::File::const_iterator articleItr;
std::vector<std::string>::iterator suggestionItr;
int result;
/* Reset the suggestions */
/* Reset the suggestions otherwise check if the suggestions number is less than the suggestionsCount */
if (reset) {
this->suggestions.clear();
} else {
if (this->suggestions.size() > suggestionsCount) {
return false;
}
}
if (prefix.size()) {
for (articleItr = zimFileHandler->findByTitle('A', prefix);
articleItr != zimFileHandler->end() &&
articleItr->getTitle().compare(0, prefix.size(), prefix) == 0 &&
this->suggestions.size() < suggestionsCount ;
articleItr != zimFileHandler->end() &&
articleItr->getTitle().compare(0, prefix.size(), prefix) == 0 &&
this->suggestions.size() < suggestionsCount ;
++articleItr) {
if (this->suggestions.size() == 0) {
this->suggestions.push_back(articleItr->getTitle());
} else {
for (suggestionItr = this->suggestions.begin() ;
suggestionItr != this->suggestions.end();
} else if (this->suggestions.size() < suggestionsCount) {
for (suggestionItr = this->suggestions.begin() ;
suggestionItr != this->suggestions.end();
++suggestionItr) {
result = articleItr->getTitle().compare(*suggestionItr);
@ -425,25 +489,25 @@ namespace kiwix {
this->suggestions.push_back(articleItr->getTitle());
}
}
/* Suggestions were found */
retVal = true;
}
}
/* Set the cursor to the beginning */
this->suggestionsOffset = this->suggestions.begin();
return retVal;
}
/* Try also a few variations of the prefix to have better results */
bool Reader::searchSuggestionsSmart(const string &prefix, unsigned int suggestionsCount) {
std::string myPrefix = prefix;
/* Normal suggestion request */
bool retVal = this->searchSuggestions(prefix, suggestionsCount, true);
/* Try with first letter uppercase */
myPrefix = kiwix::ucFirst(myPrefix);
this->searchSuggestions(myPrefix, suggestionsCount, false);
@ -452,6 +516,10 @@ namespace kiwix {
myPrefix = kiwix::lcFirst(myPrefix);
this->searchSuggestions(myPrefix, suggestionsCount, false);
/* Try with title words */
myPrefix = kiwix::toTitle(myPrefix);
this->searchSuggestions(myPrefix, suggestionsCount, false);
return retVal;
}
@ -460,10 +528,10 @@ namespace kiwix {
if (this->suggestionsOffset != this->suggestions.end()) {
/* title */
title = *(this->suggestionsOffset);
/* increment the cursor for the next call */
this->suggestionsOffset++;
return true;
}
@ -492,7 +560,7 @@ namespace kiwix {
unsigned int Reader::getFileSize() {
zim::File *file = this->getZimFileHandler();
zim::offset_type size = 0;
if (file != NULL) {
size = file->getFilesize();
}

View File

@ -38,7 +38,7 @@ using namespace std;
namespace kiwix {
class Reader {
public:
Reader(const string zimFilePath);
~Reader();
@ -58,6 +58,7 @@ namespace kiwix {
string getDate();
string getCreator();
string getPublisher();
string getOrigId();
bool getFavicon(string &content, string &mimeType);
bool getPageUrlFromTitle(const string &title, string &url);
bool getContentByUrl(const string &url, string &content, unsigned int &contentLength, string &contentType);
@ -69,7 +70,7 @@ namespace kiwix {
bool parseUrl(const string &url, char *ns, string &title);
unsigned int getFileSize();
zim::File* getZimFileHandler();
protected:
zim::File* zimFileHandler;
zim::size_type firstArticleOffset;
@ -77,7 +78,7 @@ namespace kiwix {
zim::size_type currentArticleOffset;
zim::size_type nsACount;
zim::size_type nsICount;
std::vector<std::string> suggestions;
std::vector<std::string>::iterator suggestionsOffset;

View File

@ -174,36 +174,40 @@ std::string kiwix::ucFirst (const std::string &word) {
if (word.empty())
return "";
std::string ucFirstWord;
std::string result;
#ifdef __ANDROID__
ucFirstWord = word;
ucFirstWord[0] = toupper(ucFirstWord[0]);
#else
UnicodeString firstLetter = UnicodeString(word.substr(0, 1).c_str());
UnicodeString ucFirstLetter = firstLetter.toUpper();
ucFirstLetter.toUTF8String(ucFirstWord);
ucFirstWord += word.substr(1);
#endif
UnicodeString unicodeWord(word.c_str());
UnicodeString unicodeFirstLetter = unicodeWord.tempSubString(0, 1).toUpper();
unicodeWord.replace(0, 1, unicodeFirstLetter);
unicodeWord.toUTF8String(result);
return ucFirstWord;
return result;
}
std::string kiwix::lcFirst (const std::string &word) {
if (word.empty())
return "";
std::string ucFirstWord;
std::string result;
#ifdef __ANDROID__
ucFirstWord = word;
ucFirstWord[0] = tolower(ucFirstWord[0]);
#else
UnicodeString firstLetter = UnicodeString(word.substr(0, 1).c_str());
UnicodeString ucFirstLetter = firstLetter.toLower();
ucFirstLetter.toUTF8String(ucFirstWord);
ucFirstWord += word.substr(1);
#endif
UnicodeString unicodeWord(word.c_str());
UnicodeString unicodeFirstLetter = unicodeWord.tempSubString(0, 1).toLower();
unicodeWord.replace(0, 1, unicodeFirstLetter);
unicodeWord.toUTF8String(result);
return ucFirstWord;
return result;
}
std::string kiwix::toTitle (const std::string &word) {
if (word.empty())
return "";
std::string result;
UnicodeString unicodeWord(word.c_str());
unicodeWord = unicodeWord.toTitle(0);
unicodeWord.toUTF8String(result);
return result;
}

View File

@ -20,7 +20,6 @@
#ifndef KIWIX_STRINGTOOLS_H
#define KIWIX_STRINGTOOLS_H
#ifndef __ANDROID__
#include <unicode/translit.h>
#include <unicode/normlzr.h>
#include <unicode/unistr.h>
@ -29,7 +28,6 @@
#include <unicode/uniset.h>
#include <unicode/ustring.h>
#include <unicode/ucnv.h>
#endif
#include <iostream>
#include <vector>
@ -58,6 +56,7 @@ namespace kiwix {
std::string ucFirst(const std::string &word);
std::string lcFirst(const std::string &word);
std::string toTitle(const std::string &word);
}
#endif