mirror of https://github.com/kiwix/libkiwix.git
+ replace libunac by libicu
This commit is contained in:
parent
40042efe06
commit
27ee27bbe9
|
@ -20,7 +20,7 @@ namespace kiwix {
|
||||||
: zimFileHandler(NULL),
|
: zimFileHandler(NULL),
|
||||||
articleCount(0),
|
articleCount(0),
|
||||||
stepSize(0) {
|
stepSize(0) {
|
||||||
|
|
||||||
/* Open the ZIM file */
|
/* Open the ZIM file */
|
||||||
this->zimFileHandler = new zim::File(zimFilePath);
|
this->zimFileHandler = new zim::File(zimFilePath);
|
||||||
|
|
||||||
|
@ -123,19 +123,19 @@ namespace kiwix {
|
||||||
|
|
||||||
/* Index the title */
|
/* Index the title */
|
||||||
if (!this->htmlParser.title.empty()) {
|
if (!this->htmlParser.title.empty()) {
|
||||||
indexer.index_text_without_positions(removeAccents(this->htmlParser.title.c_str()),
|
indexer.index_text_without_positions(removeAccents(this->htmlParser.title.c_str(), this->htmlParser.title.size()),
|
||||||
((this->htmlParser.dump.size() / 100) + 1) /
|
((this->htmlParser.dump.size() / 100) + 1) /
|
||||||
countWords(this->htmlParser.title) );
|
countWords(this->htmlParser.title) );
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Index the keywords */
|
/* Index the keywords */
|
||||||
if (!this->htmlParser.keywords.empty()) {
|
if (!this->htmlParser.keywords.empty()) {
|
||||||
indexer.index_text_without_positions(removeAccents(this->htmlParser.keywords.c_str()), 3);
|
indexer.index_text_without_positions(removeAccents(this->htmlParser.keywords.c_str(), this->htmlParser.keywords.size()), 3);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Index the content */
|
/* Index the content */
|
||||||
if (!this->htmlParser.dump.empty()) {
|
if (!this->htmlParser.dump.empty()) {
|
||||||
indexer.index_text_without_positions(removeAccents(this->htmlParser.dump.c_str()));
|
indexer.index_text_without_positions(removeAccents(this->htmlParser.dump.c_str(), this->htmlParser.dump.size()));
|
||||||
}
|
}
|
||||||
|
|
||||||
/* add to the database */
|
/* add to the database */
|
||||||
|
@ -186,5 +186,4 @@ namespace kiwix {
|
||||||
std::cout << "Read " << this->stopWords.size() << " lines.\n";
|
std::cout << "Read " << this->stopWords.size() << " lines.\n";
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -36,7 +36,7 @@ namespace kiwix {
|
||||||
|
|
||||||
/* Create the query term vector */
|
/* Create the query term vector */
|
||||||
/* The double quote (") is in the delimiter list because of bug ID: 2939690 */
|
/* The double quote (") is in the delimiter list because of bug ID: 2939690 */
|
||||||
std::vector<std::string> queryTerms = split(removeAccents(search.c_str()), " #@%$0/\\_-*()[]{},;:\"´`'");
|
std::vector<std::string> queryTerms = split(removeAccents(search.c_str(), search.size()), " #@%$0/\\_-*()[]{},;:\"´`'");
|
||||||
|
|
||||||
/* Create query object */
|
/* Create query object */
|
||||||
Xapian::Query query(Xapian::Query::OP_OR, queryTerms.begin(), queryTerms.end());
|
Xapian::Query query(Xapian::Query::OP_OR, queryTerms.begin(), queryTerms.end());
|
||||||
|
|
|
@ -1,21 +1,79 @@
|
||||||
|
/********************************************************************
|
||||||
|
* COPYRIGHT:
|
||||||
|
* Copyright (c) 1999-2003, International Business Machines Corporation and
|
||||||
|
* others. All Rights Reserved.
|
||||||
|
********************************************************************/
|
||||||
|
|
||||||
#include "unaccent.h"
|
#include "unaccent.h"
|
||||||
|
|
||||||
using namespace std;
|
const char UnaccentTransliterator::fgClassID = 0;
|
||||||
|
|
||||||
/* Remove accent */
|
/**
|
||||||
std::string removeAccents(const char *text = NULL) {
|
* Constructor
|
||||||
char* out = 0;
|
*/
|
||||||
size_t out_length = 0;
|
UnaccentTransliterator::UnaccentTransliterator() :
|
||||||
|
normalizer("", UNORM_NFD),
|
||||||
|
Transliterator("Unaccent", 0) {
|
||||||
|
}
|
||||||
|
|
||||||
if (!unac_string("UTF8", text, strlen(text), &out, &out_length)) {
|
/**
|
||||||
std::string textWithoutAccent = string(out, out_length);
|
* Destructor
|
||||||
free(out);
|
*/
|
||||||
return textWithoutAccent;
|
UnaccentTransliterator::~UnaccentTransliterator() {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (text != NULL) {
|
/**
|
||||||
return text;
|
* Remove accents from a character using Normalizer.
|
||||||
|
*/
|
||||||
|
UChar UnaccentTransliterator::unaccent(UChar c) const {
|
||||||
|
UnicodeString str(c);
|
||||||
|
UErrorCode status = U_ZERO_ERROR;
|
||||||
|
UnaccentTransliterator* t = (UnaccentTransliterator*)this;
|
||||||
|
|
||||||
|
t->normalizer.setText(str, status);
|
||||||
|
if (U_FAILURE(status)) {
|
||||||
|
return c;
|
||||||
|
}
|
||||||
|
return (UChar) t->normalizer.next();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Implement Transliterator API
|
||||||
|
*/
|
||||||
|
void UnaccentTransliterator::handleTransliterate(Replaceable& text,
|
||||||
|
UTransPosition& index,
|
||||||
|
UBool incremental) const {
|
||||||
|
UnicodeString str("a");
|
||||||
|
while (index.start < index.limit) {
|
||||||
|
UChar c = text.charAt(index.start);
|
||||||
|
UChar d = unaccent(c);
|
||||||
|
if (c != d) {
|
||||||
|
str.setCharAt(0, d);
|
||||||
|
text.handleReplaceBetween(index.start, index.start+1, str);
|
||||||
|
}
|
||||||
|
index.start++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Remove accents from a String */
|
||||||
|
UnaccentTransliterator unaccent;
|
||||||
|
char *unaccentedString = NULL;
|
||||||
|
unsigned unaccentedStringSize=0;
|
||||||
|
UnicodeString unicodeAccentedString;
|
||||||
|
|
||||||
|
const char* removeAccents(const char *accentedString, const unsigned size) {
|
||||||
|
|
||||||
|
/* Realloc memory if necessary */
|
||||||
|
if (size > unaccentedStringSize) {
|
||||||
|
unaccentedString = (char*)realloc(unaccentedString, size+1);
|
||||||
|
unaccentedStringSize = size+1;
|
||||||
}
|
}
|
||||||
|
|
||||||
return "";
|
/* Transcode the String */
|
||||||
|
unicodeAccentedString = UnicodeString(accentedString);
|
||||||
|
unaccent.transliterate(unicodeAccentedString);
|
||||||
|
|
||||||
|
/* Extract and return the result */
|
||||||
|
unicodeAccentedString.extract(0, size, unaccentedString, size, "UTF-8");
|
||||||
|
return unaccentedString;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,8 +1,98 @@
|
||||||
#include <stdlib.h>
|
/********************************************************************
|
||||||
#include <string>
|
* COPYRIGHT:
|
||||||
#include <stdio.h>
|
* Copyright (c) 1999-2003, International Business Machines Corporation and
|
||||||
#include <unac.h>
|
* others. All Rights Reserved.
|
||||||
#include <string.h>
|
********************************************************************/
|
||||||
#include <iostream>
|
|
||||||
|
|
||||||
std::string removeAccents(const char *text);
|
#include <unicode/translit.h>
|
||||||
|
#include <unicode/normlzr.h>
|
||||||
|
#include <unicode/unistr.h>
|
||||||
|
#include <unicode/rep.h>
|
||||||
|
#include <unicode/translit.h>
|
||||||
|
#include <unicode/uniset.h>
|
||||||
|
#include <unicode/bytestream.h>
|
||||||
|
#include <iostream>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
class UnaccentTransliterator : public Transliterator {
|
||||||
|
|
||||||
|
public:
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructor
|
||||||
|
*/
|
||||||
|
UnaccentTransliterator();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Destructor
|
||||||
|
*/
|
||||||
|
virtual ~UnaccentTransliterator();
|
||||||
|
|
||||||
|
protected:
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Implement Transliterator API
|
||||||
|
*/
|
||||||
|
virtual void handleTransliterate(Replaceable& text,
|
||||||
|
UTransPosition& index,
|
||||||
|
UBool incremental) const;
|
||||||
|
|
||||||
|
private:
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Unaccent a single character using normalizer.
|
||||||
|
*/
|
||||||
|
UChar unaccent(UChar c) const;
|
||||||
|
|
||||||
|
Normalizer normalizer;
|
||||||
|
|
||||||
|
public:
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return the class ID for this class. This is useful only for
|
||||||
|
* comparing to a return value from getDynamicClassID(). For example:
|
||||||
|
* <pre>
|
||||||
|
* . Base* polymorphic_pointer = createPolymorphicObject();
|
||||||
|
* . if (polymorphic_pointer->getDynamicClassID() ==
|
||||||
|
* . Derived::getStaticClassID()) ...
|
||||||
|
* </pre>
|
||||||
|
* @return The class ID for all objects of this class.
|
||||||
|
* @stable ICU 2.0
|
||||||
|
*/
|
||||||
|
static inline UClassID getStaticClassID(void) { return (UClassID)&fgClassID; };
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a unique class ID <b>polymorphically</b>. This method
|
||||||
|
* is to implement a simple version of RTTI, since not all C++
|
||||||
|
* compilers support genuine RTTI. Polymorphic operator==() and
|
||||||
|
* clone() methods call this method.
|
||||||
|
*
|
||||||
|
* <p>Concrete subclasses of Transliterator that wish clients to
|
||||||
|
* be able to identify them should implement getDynamicClassID()
|
||||||
|
* and also a static method and data member:
|
||||||
|
*
|
||||||
|
* <pre>
|
||||||
|
* static UClassID getStaticClassID() { return (UClassID)&fgClassID; }
|
||||||
|
* static char fgClassID;
|
||||||
|
* </pre>
|
||||||
|
*
|
||||||
|
* Subclasses that do not implement this method will have a
|
||||||
|
* dynamic class ID of Transliterator::getStaticClassID().
|
||||||
|
*
|
||||||
|
* @return The class ID for this object. All objects of a given
|
||||||
|
* class have the same class ID. Objects of other classes have
|
||||||
|
* different class IDs.
|
||||||
|
* @stable ICU 2.0
|
||||||
|
*/
|
||||||
|
virtual UClassID getDynamicClassID(void) const { return getStaticClassID(); };
|
||||||
|
|
||||||
|
private:
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Class identifier for subclasses of Transliterator that do not
|
||||||
|
* define their class (anonymous subclasses).
|
||||||
|
*/
|
||||||
|
static const char fgClassID;
|
||||||
|
};
|
||||||
|
|
||||||
|
const char* removeAccents(const char *, const unsigned);
|
||||||
|
|
Loading…
Reference in New Issue