From 27ee27bbe949e9f3848583096e6236af867f9909 Mon Sep 17 00:00:00 2001
From: kelson42 <kelson42@users.sourceforge.net>
Date: Fri, 16 Apr 2010 12:28:42 +0000
Subject: [PATCH] + replace libunac by libicu

---
 src/common/kiwix/indexer.cpp  |   9 ++-
 src/common/kiwix/searcher.cpp |   2 +-
 src/common/unaccent.cpp       |  84 ++++++++++++++++++++++-----
 src/common/unaccent.h         | 104 +++++++++++++++++++++++++++++++---
 4 files changed, 173 insertions(+), 26 deletions(-)

diff --git a/src/common/kiwix/indexer.cpp b/src/common/kiwix/indexer.cpp
index 14cce4a6d..33b00f4f4 100644
--- a/src/common/kiwix/indexer.cpp
+++ b/src/common/kiwix/indexer.cpp
@@ -20,7 +20,7 @@ namespace kiwix {
     : zimFileHandler(NULL), 
       articleCount(0), 
       stepSize(0) {
-    
+
     /* Open the ZIM file */
     this->zimFileHandler = new zim::File(zimFilePath);
     
@@ -123,19 +123,19 @@ namespace kiwix {
 	  
 	  /* Index the title */
 	  if (!this->htmlParser.title.empty()) {
-	    indexer.index_text_without_positions(removeAccents(this->htmlParser.title.c_str()), 
+	    indexer.index_text_without_positions(removeAccents(this->htmlParser.title.c_str(), this->htmlParser.title.size()), 
 						 ((this->htmlParser.dump.size() / 100) + 1) / 
 						 countWords(this->htmlParser.title) );
 	  }
 	  
 	  /* Index the keywords */
 	  if (!this->htmlParser.keywords.empty()) {
-	    indexer.index_text_without_positions(removeAccents(this->htmlParser.keywords.c_str()), 3);
+	    indexer.index_text_without_positions(removeAccents(this->htmlParser.keywords.c_str(), this->htmlParser.keywords.size()), 3);
 	  }
 	  
 	  /* Index the content */
 	  if (!this->htmlParser.dump.empty()) {
-	    indexer.index_text_without_positions(removeAccents(this->htmlParser.dump.c_str()));
+	    indexer.index_text_without_positions(removeAccents(this->htmlParser.dump.c_str(), this->htmlParser.dump.size()));
 	  }
 	  
 	  /* add to the database */
@@ -186,5 +186,4 @@ namespace kiwix {
     std::cout << "Read " << this->stopWords.size() << " lines.\n";
     return true;
   }
-  
 }
diff --git a/src/common/kiwix/searcher.cpp b/src/common/kiwix/searcher.cpp
index d9c7105e1..817940978 100644
--- a/src/common/kiwix/searcher.cpp
+++ b/src/common/kiwix/searcher.cpp
@@ -36,7 +36,7 @@ namespace kiwix {
     
     /* Create the query term vector */
     /* I have the doublequote " because bug ID: 2939690 */
-    std::vector<std::string> queryTerms = split(removeAccents(search.c_str()), " #@%$0/\\_-*()[]{},;:\"´`'");
+    std::vector<std::string> queryTerms = split(removeAccents(search.c_str(), search.size()), " #@%$0/\\_-*()[]{},;:\"´`'");
     
     /* Create query object */
     Xapian::Query query(Xapian::Query::OP_OR, queryTerms.begin(), queryTerms.end());
diff --git a/src/common/unaccent.cpp b/src/common/unaccent.cpp
index 5822e32fe..0dd27057c 100644
--- a/src/common/unaccent.cpp
+++ b/src/common/unaccent.cpp
@@ -1,21 +1,79 @@
+/********************************************************************
+ * COPYRIGHT:
+ * Copyright (c) 1999-2003, International Business Machines Corporation and
+ * others. All Rights Reserved.
+ ********************************************************************/
+
 #include "unaccent.h"
 
-using namespace std;
+const char UnaccentTransliterator::fgClassID = 0;
 
-/* Remove accent */
-std::string removeAccents(const char *text = NULL) { 
-  char* out = 0;
-  size_t out_length = 0;
+/**
+ * Constructor: the base gets the ID "Unaccent" and no filter, the normalizer uses NFD.
+ */
+UnaccentTransliterator::UnaccentTransliterator() :
+    Transliterator("Unaccent", 0),
+    normalizer("", UNORM_NFD) {
+}
 
-  if (!unac_string("UTF8", text, strlen(text), &out, &out_length)) {
-    std::string textWithoutAccent = string(out, out_length);
-    free(out);
-    return textWithoutAccent;
-  } 
+/**
+ * Destructor
+ */
+UnaccentTransliterator::~UnaccentTransliterator() {
+}
 
-  if (text != NULL) {
-    return text;
+/**
+ * Unaccent one UTF-16 code unit via NFD normalization; returns c itself if setText fails.
+ */
+UChar UnaccentTransliterator::unaccent(UChar c) const {
+    UnicodeString str(c);
+    UErrorCode status = U_ZERO_ERROR;
+    UnaccentTransliterator* t = (UnaccentTransliterator*)this;
+
+    t->normalizer.setText(str, status);
+    if (U_FAILURE(status)) {
+        return c;
+    }
+    return (UChar) t->normalizer.next();
+}
+
+/**
+ * Implement Transliterator API: unaccent each code unit in [index.start, index.limit); incremental is unused.
+ */
+void UnaccentTransliterator::handleTransliterate(Replaceable& text,
+                                                 UTransPosition& index,
+                                                 UBool incremental) const {
+    UnicodeString str("a");
+    while (index.start < index.limit) {
+        UChar c = text.charAt(index.start);
+        UChar d = unaccent(c);
+        if (c != d) {
+            str.setCharAt(0, d);
+            text.handleReplaceBetween(index.start, index.start+1, str);
+        }
+        index.start++;
+    }
+}
+
+/* Remove accents from a String: file-local state reused across calls (not thread-safe) */
+static UnaccentTransliterator unaccent;
+static char *unaccentedString = NULL;
+static unsigned unaccentedStringSize = 0;
+static UnicodeString unicodeAccentedString;
+
+const char* removeAccents(const char *accentedString, const unsigned size) {
+  /* Returns a NUL-terminated UTF-8 copy of the size-byte UTF-8 input with accents
+     removed. The buffer is shared, so each call invalidates the previous result. */
+  if (size >= unaccentedStringSize) {  /* room for size bytes plus the NUL */
+    unaccentedString = (char*)realloc(unaccentedString, size+1);
+    unaccentedStringSize = size+1;
   }
 
-  return "";
+  if (unaccentedString == NULL) { unaccentedStringSize = 0; return ""; }
+  /* Decode exactly size bytes as UTF-8 instead of the platform default codepage */
+  unicodeAccentedString = UnicodeString(accentedString, size, "UTF-8");
+  unaccent.transliterate(unicodeAccentedString);
+  const unsigned written = unicodeAccentedString.extract(0, size, unaccentedString, size, "UTF-8");
+  unaccentedString[written < size ? written : size] = '\0';  /* extract() may omit the NUL */
+  return unaccentedString;
 }
diff --git a/src/common/unaccent.h b/src/common/unaccent.h
index 418f42833..c9de5309c 100644
--- a/src/common/unaccent.h
+++ b/src/common/unaccent.h
@@ -1,8 +1,98 @@
-#include <stdlib.h>
-#include <string>
-#include <stdio.h>
-#include <unac.h>
-#include <string.h>
-#include <iostream>
+/********************************************************************
+ * COPYRIGHT:
+ * Copyright (c) 1999-2003, International Business Machines Corporation and
+ * others. All Rights Reserved.
+ ********************************************************************/
 
-std::string removeAccents(const char *text);
+#include <stdlib.h>
+#include <unicode/translit.h>
+#include <unicode/normlzr.h>
+#include <unicode/unistr.h>
+#include <unicode/rep.h>
+#include <unicode/uniset.h>
+#include <unicode/bytestream.h>
+#include <iostream>
+#include <string>
+
+class UnaccentTransliterator : public Transliterator {
+    
+ public:
+    
+    /**
+     * Constructor
+     */
+    UnaccentTransliterator();
+
+    /**
+     * Destructor
+     */
+    virtual ~UnaccentTransliterator();
+
+ protected:
+
+    /**
+     * Implement Transliterator API
+     */
+    virtual void handleTransliterate(Replaceable& text,
+                                     UTransPosition& index,
+                                     UBool incremental) const;
+
+ private:
+
+    /**
+     * Unaccent a single character using normalizer.
+     */
+    UChar unaccent(UChar c) const;
+
+    Normalizer normalizer;
+
+public:
+
+    /**
+     * Return the class ID for this class.  This is useful only for
+     * comparing to a return value from getDynamicClassID().  For example:
+     * <pre>
+     * .      Base* polymorphic_pointer = createPolymorphicObject();
+     * .      if (polymorphic_pointer->getDynamicClassID() ==
+     * .          Derived::getStaticClassID()) ...
+     * </pre>
+     * @return          The class ID for all objects of this class.
+     * @stable ICU 2.0
+     */
+    static inline UClassID getStaticClassID(void) { return (UClassID)&fgClassID; };
+
+    /**
+     * Returns a unique class ID <b>polymorphically</b>.  This method
+     * is to implement a simple version of RTTI, since not all C++
+     * compilers support genuine RTTI.  Polymorphic operator==() and
+     * clone() methods call this method.
+     * 
+     * <p>Concrete subclasses of Transliterator that wish clients to
+     * be able to identify them should implement getDynamicClassID()
+     * and also a static method and data member:
+     * 
+     * <pre>
+     * static UClassID getStaticClassID() { return (UClassID)&fgClassID; }
+     * static char fgClassID;
+     * </pre>
+     *
+     * Subclasses that do not implement this method will have a
+     * dynamic class ID of Transliterator::getStaticClassID().
+     *
+     * @return The class ID for this object. All objects of a given
+     * class have the same class ID.  Objects of other classes have
+     * different class IDs.
+     * @stable ICU 2.0
+     */
+    virtual UClassID getDynamicClassID(void) const { return getStaticClassID(); };
+
+private:
+
+    /**
+     * Class identifier for subclasses of Transliterator that do not
+     * define their class (anonymous subclasses).
+     */
+    static const char fgClassID;
+};
+
+const char* removeAccents(const char *, const unsigned);