From 27ee27bbe949e9f3848583096e6236af867f9909 Mon Sep 17 00:00:00 2001 From: kelson42 Date: Fri, 16 Apr 2010 12:28:42 +0000 Subject: [PATCH] + replace libunac by libicu --- src/common/kiwix/indexer.cpp | 9 ++- src/common/kiwix/searcher.cpp | 2 +- src/common/unaccent.cpp | 84 ++++++++++++++++++++++----- src/common/unaccent.h | 104 +++++++++++++++++++++++++++++++--- 4 files changed, 173 insertions(+), 26 deletions(-) diff --git a/src/common/kiwix/indexer.cpp b/src/common/kiwix/indexer.cpp index 14cce4a6d..33b00f4f4 100644 --- a/src/common/kiwix/indexer.cpp +++ b/src/common/kiwix/indexer.cpp @@ -20,7 +20,7 @@ namespace kiwix { : zimFileHandler(NULL), articleCount(0), stepSize(0) { - + /* Open the ZIM file */ this->zimFileHandler = new zim::File(zimFilePath); @@ -123,19 +123,19 @@ namespace kiwix { /* Index the title */ if (!this->htmlParser.title.empty()) { - indexer.index_text_without_positions(removeAccents(this->htmlParser.title.c_str()), + indexer.index_text_without_positions(removeAccents(this->htmlParser.title.c_str(), this->htmlParser.title.size()), ((this->htmlParser.dump.size() / 100) + 1) / countWords(this->htmlParser.title) ); } /* Index the keywords */ if (!this->htmlParser.keywords.empty()) { - indexer.index_text_without_positions(removeAccents(this->htmlParser.keywords.c_str()), 3); + indexer.index_text_without_positions(removeAccents(this->htmlParser.keywords.c_str(), this->htmlParser.keywords.size()), 3); } /* Index the content */ if (!this->htmlParser.dump.empty()) { - indexer.index_text_without_positions(removeAccents(this->htmlParser.dump.c_str())); + indexer.index_text_without_positions(removeAccents(this->htmlParser.dump.c_str(), this->htmlParser.dump.size())); } /* add to the database */ @@ -186,5 +186,4 @@ namespace kiwix { std::cout << "Read " << this->stopWords.size() << " lines.\n"; return true; } - } diff --git a/src/common/kiwix/searcher.cpp b/src/common/kiwix/searcher.cpp index d9c7105e1..817940978 100644 --- 
a/src/common/kiwix/searcher.cpp +++ b/src/common/kiwix/searcher.cpp @@ -36,7 +36,7 @@ namespace kiwix { /* Create the query term vector */ /* I have the doublequote " because bug ID: 2939690 */ - std::vector queryTerms = split(removeAccents(search.c_str()), " #@%$0/\\_-*()[]{},;:\"´`'"); + std::vector queryTerms = split(removeAccents(search.c_str(), search.size()), " #@%$0/\\_-*()[]{},;:\"´`'"); /* Create query object */ Xapian::Query query(Xapian::Query::OP_OR, queryTerms.begin(), queryTerms.end()); diff --git a/src/common/unaccent.cpp b/src/common/unaccent.cpp index 5822e32fe..0dd27057c 100644 --- a/src/common/unaccent.cpp +++ b/src/common/unaccent.cpp @@ -1,21 +1,79 @@ +/******************************************************************** + * COPYRIGHT: + * Copyright (c) 1999-2003, International Business Machines Corporation and + * others. All Rights Reserved. + ********************************************************************/ + #include "unaccent.h" -using namespace std; +const char UnaccentTransliterator::fgClassID = 0; -/* Remove accent */ -std::string removeAccents(const char *text = NULL) { - char* out = 0; - size_t out_length = 0; +/** + * Constructor + */ +UnaccentTransliterator::UnaccentTransliterator() : + normalizer("", UNORM_NFD), + Transliterator("Unaccent", 0) { +} - if (!unac_string("UTF8", text, strlen(text), &out, &out_length)) { - std::string textWithoutAccent = string(out, out_length); - free(out); - return textWithoutAccent; - } +/** + * Destructor + */ +UnaccentTransliterator::~UnaccentTransliterator() { +} - if (text != NULL) { - return text; +/** + * Remove accents from a character using Normalizer. 
+ */ +UChar UnaccentTransliterator::unaccent(UChar c) const { + UnicodeString str(c); + UErrorCode status = U_ZERO_ERROR; + UnaccentTransliterator* t = (UnaccentTransliterator*)this; + + t->normalizer.setText(str, status); + if (U_FAILURE(status)) { + return c; + } + return (UChar) t->normalizer.next(); +} + +/** + * Implement Transliterator API + */ +void UnaccentTransliterator::handleTransliterate(Replaceable& text, + UTransPosition& index, + UBool incremental) const { + UnicodeString str("a"); + while (index.start < index.limit) { + UChar c = text.charAt(index.start); + UChar d = unaccent(c); + if (c != d) { + str.setCharAt(0, d); + text.handleReplaceBetween(index.start, index.start+1, str); + } + index.start++; + } +} + +/* Remove accents from a String */ +UnaccentTransliterator unaccent; +char *unaccentedString = NULL; +unsigned unaccentedStringSize=0; +UnicodeString unicodeAccentedString; + +const char* removeAccents(const char *accentedString, const unsigned size) { + + /* Realloc memory if necessary */ + if (size > unaccentedStringSize) { + unaccentedString = (char*)realloc(unaccentedString, size+1); + unaccentedStringSize = size+1; } - return ""; + /* Transcode the String */ + unicodeAccentedString = UnicodeString(accentedString); + unaccent.transliterate(unicodeAccentedString); + + /* Extract and return the result */ + unicodeAccentedString.extract(0, size, unaccentedString, size, "UTF-8"); + return unaccentedString; } diff --git a/src/common/unaccent.h b/src/common/unaccent.h index 418f42833..c9de5309c 100644 --- a/src/common/unaccent.h +++ b/src/common/unaccent.h @@ -1,8 +1,98 @@ -#include -#include -#include -#include -#include -#include +/******************************************************************** + * COPYRIGHT: + * Copyright (c) 1999-2003, International Business Machines Corporation and + * others. All Rights Reserved. 
+ ********************************************************************/ -std::string removeAccents(const char *text); +#include +#include +#include +#include +#include +#include +#include +#include +#include + +class UnaccentTransliterator : public Transliterator { + + public: + + /** + * Constructor + */ + UnaccentTransliterator(); + + /** + * Destructor + */ + virtual ~UnaccentTransliterator(); + + protected: + + /** + * Implement Transliterator API + */ + virtual void handleTransliterate(Replaceable& text, + UTransPosition& index, + UBool incremental) const; + + private: + + /** + * Unaccent a single character using normalizer. + */ + UChar unaccent(UChar c) const; + + Normalizer normalizer; + +public: + + /** + * Return the class ID for this class. This is useful only for + * comparing to a return value from getDynamicClassID(). For example: + *
+     * .      Base* polymorphic_pointer = createPolymorphicObject();
+     * .      if (polymorphic_pointer->getDynamicClassID() ==
+     * .          Derived::getStaticClassID()) ...
+     * 
+ * @return The class ID for all objects of this class. + * @stable ICU 2.0 + */ + static inline UClassID getStaticClassID(void) { return (UClassID)&fgClassID; }; + + /** + * Returns a unique class ID polymorphically. This method + * is to implement a simple version of RTTI, since not all C++ + * compilers support genuine RTTI. Polymorphic operator==() and + * clone() methods call this method. + * + *

Concrete subclasses of Transliterator that wish clients to + * be able to identify them should implement getDynamicClassID() + * and also a static method and data member: + * + *

+     * static UClassID getStaticClassID() { return (UClassID)&fgClassID; }
+     * static char fgClassID;
+     * 
+ * + * Subclasses that do not implement this method will have a + * dynamic class ID of Transliterator::getStaticClassID(). + * + * @return The class ID for this object. All objects of a given + * class have the same class ID. Objects of other classes have + * different class IDs. + * @stable ICU 2.0 + */ + virtual UClassID getDynamicClassID(void) const { return getStaticClassID(); }; + +private: + + /** + * Class identifier for subclasses of Transliterator that do not + * define their class (anonymous subclasses). + */ + static const char fgClassID; +}; + +const char* removeAccents(const char *, const unsigned);