+ replace libunac by libicu

This commit is contained in:
kelson42 2010-04-16 12:28:42 +00:00
parent 40042efe06
commit 27ee27bbe9
4 changed files with 173 additions and 26 deletions

View File

@ -20,7 +20,7 @@ namespace kiwix {
: zimFileHandler(NULL), : zimFileHandler(NULL),
articleCount(0), articleCount(0),
stepSize(0) { stepSize(0) {
/* Open the ZIM file */ /* Open the ZIM file */
this->zimFileHandler = new zim::File(zimFilePath); this->zimFileHandler = new zim::File(zimFilePath);
@ -123,19 +123,19 @@ namespace kiwix {
/* Index the title */ /* Index the title */
if (!this->htmlParser.title.empty()) { if (!this->htmlParser.title.empty()) {
indexer.index_text_without_positions(removeAccents(this->htmlParser.title.c_str()), indexer.index_text_without_positions(removeAccents(this->htmlParser.title.c_str(), this->htmlParser.title.size()),
((this->htmlParser.dump.size() / 100) + 1) / ((this->htmlParser.dump.size() / 100) + 1) /
countWords(this->htmlParser.title) ); countWords(this->htmlParser.title) );
} }
/* Index the keywords */ /* Index the keywords */
if (!this->htmlParser.keywords.empty()) { if (!this->htmlParser.keywords.empty()) {
indexer.index_text_without_positions(removeAccents(this->htmlParser.keywords.c_str()), 3); indexer.index_text_without_positions(removeAccents(this->htmlParser.keywords.c_str(), this->htmlParser.keywords.size()), 3);
} }
/* Index the content */ /* Index the content */
if (!this->htmlParser.dump.empty()) { if (!this->htmlParser.dump.empty()) {
indexer.index_text_without_positions(removeAccents(this->htmlParser.dump.c_str())); indexer.index_text_without_positions(removeAccents(this->htmlParser.dump.c_str(), this->htmlParser.dump.size()));
} }
/* add to the database */ /* add to the database */
@ -186,5 +186,4 @@ namespace kiwix {
std::cout << "Read " << this->stopWords.size() << " lines.\n"; std::cout << "Read " << this->stopWords.size() << " lines.\n";
return true; return true;
} }
} }

View File

@ -36,7 +36,7 @@ namespace kiwix {
/* Create the query term vector */ /* Create the query term vector */
/* I have the doublequote " because bug ID: 2939690 */ /* I have the doublequote " because bug ID: 2939690 */
std::vector<std::string> queryTerms = split(removeAccents(search.c_str()), " #@%$0/\\_-*()[]{},;:\"´`'"); std::vector<std::string> queryTerms = split(removeAccents(search.c_str(), search.size()), " #@%$0/\\_-*()[]{},;:\"´`'");
/* Create query object */ /* Create query object */
Xapian::Query query(Xapian::Query::OP_OR, queryTerms.begin(), queryTerms.end()); Xapian::Query query(Xapian::Query::OP_OR, queryTerms.begin(), queryTerms.end());

View File

@ -1,21 +1,79 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1999-2003, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
#include "unaccent.h" #include "unaccent.h"
using namespace std; const char UnaccentTransliterator::fgClassID = 0;
/* Remove accent */ /**
std::string removeAccents(const char *text = NULL) { * Constructor
char* out = 0; */
size_t out_length = 0; UnaccentTransliterator::UnaccentTransliterator() :
normalizer("", UNORM_NFD),
Transliterator("Unaccent", 0) {
}
if (!unac_string("UTF8", text, strlen(text), &out, &out_length)) { /**
std::string textWithoutAccent = string(out, out_length); * Destructor
free(out); */
return textWithoutAccent; UnaccentTransliterator::~UnaccentTransliterator() {
} }
if (text != NULL) { /**
return text; * Remove accents from a character using Normalizer.
*/
UChar UnaccentTransliterator::unaccent(UChar c) const {
  /* This method is const, but feeding text to the normalizer member
     mutates it, so constness is cast away for the calls below. */
  UnaccentTransliterator* self = const_cast<UnaccentTransliterator*>(this);

  /* Hand the single character to the normalizer as a one-character string. */
  UnicodeString single(c);
  UErrorCode errorCode = U_ZERO_ERROR;
  self->normalizer.setText(single, errorCode);

  /* On success, the first value produced by the normalizer is returned
     (narrowed to UChar); on failure the character is returned unchanged. */
  if (U_SUCCESS(errorCode)) {
    return static_cast<UChar>(self->normalizer.next());
  }
  return c;
}
/**
 * Transliterator API hook.
 *
 * Visits every position in [index.start, index.limit) and replaces each
 * character whose unaccent() result differs from the character itself.
 * index.start is advanced to index.limit on return. The incremental flag
 * is not consulted.
 */
void UnaccentTransliterator::handleTransliterate(Replaceable& text,
                                                 UTransPosition& index,
                                                 UBool incremental) const {
  /* One-character scratch string; its only character is overwritten
     before every use, so the initial "a" is never written to text. */
  UnicodeString replacement("a");

  for (; index.start < index.limit; ++index.start) {
    const UChar original = text.charAt(index.start);
    const UChar stripped = unaccent(original);
    if (stripped == original) {
      continue;
    }
    replacement.setCharAt(0, stripped);
    text.handleReplaceBetween(index.start, index.start + 1, replacement);
  }
}
/* Remove accents from a String */
UnaccentTransliterator unaccent;
char *unaccentedString = NULL;
unsigned unaccentedStringSize=0;
UnicodeString unicodeAccentedString;
const char* removeAccents(const char *accentedString, const unsigned size) {
/* Realloc memory if necessary */
if (size > unaccentedStringSize) {
unaccentedString = (char*)realloc(unaccentedString, size+1);
unaccentedStringSize = size+1;
} }
return ""; /* Transcode the String */
unicodeAccentedString = UnicodeString(accentedString);
unaccent.transliterate(unicodeAccentedString);
/* Extract and return the result */
unicodeAccentedString.extract(0, size, unaccentedString, size, "UTF-8");
return unaccentedString;
} }

View File

@ -1,8 +1,98 @@
#include <stdlib.h> /********************************************************************
#include <string> * COPYRIGHT:
#include <stdio.h> * Copyright (c) 1999-2003, International Business Machines Corporation and
#include <unac.h> * others. All Rights Reserved.
#include <string.h> ********************************************************************/
#include <iostream>
std::string removeAccents(const char *text); #include <unicode/translit.h>
#include <unicode/normlzr.h>
#include <unicode/unistr.h>
#include <unicode/rep.h>
#include <unicode/translit.h>
#include <unicode/uniset.h>
#include <unicode/bytestream.h>
#include <iostream>
#include <string>
/**
 * Transliterator subclass that removes accents, one character at a time,
 * with the help of a Normalizer.
 */
class UnaccentTransliterator : public Transliterator {

 public:

  /** Constructor. */
  UnaccentTransliterator();

  /** Destructor. */
  virtual ~UnaccentTransliterator();

 protected:

  /** Implementation of the Transliterator API hook. */
  virtual void handleTransliterate(Replaceable& text,
                                   UTransPosition& index,
                                   UBool incremental) const;

 private:

  /** Unaccent a single character using the normalizer. */
  UChar unaccent(UChar c) const;

  /** Normalizer used by unaccent(). */
  Normalizer normalizer;

 public:

  /**
   * Class ID shared by all objects of this class: the address of
   * fgClassID. Meant to be compared against the value returned by
   * getDynamicClassID().
   */
  static inline UClassID getStaticClassID(void) { return (UClassID)&fgClassID; }

  /**
   * Polymorphic class ID of this object; always equal to
   * getStaticClassID() for this class.
   */
  virtual UClassID getDynamicClassID(void) const { return getStaticClassID(); }

 private:

  /** Only the address of this member is used, as the class identifier. */
  static const char fgClassID;
};
/**
 * Remove accents from a string.
 *
 * @param accentedString Text to process.
 * @param size           Length of accentedString in bytes.
 * @return The unaccented text. The pointer refers to storage owned by the
 *         implementation and reused by later calls: copy the result if it
 *         must outlive the next call, and do not free it.
 */
const char* removeAccents(const char *accentedString, const unsigned size);