src/foundation/ZUnicode.h

00001 /*  @(#) $Id: ZUnicode.h,v 1.39 2007/04/01 12:49:37 agreen Exp $ */
00002 
00003 /* ------------------------------------------------------------
00004 Copyright (c) 2001 Andrew Green and Learning in Motion, Inc.
00005 http://www.zoolib.org
00006 
00007 Permission is hereby granted, free of charge, to any person obtaining a copy
00008 of this software and associated documentation files (the "Software"), to deal
00009 in the Software without restriction, including without limitation the rights
00010 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
00011 copies of the Software, and to permit persons to whom the Software is
00012 furnished to do so, subject to the following conditions:
00013 
00014 The above copyright notice and this permission notice shall be included in
00015 all copies or substantial portions of the Software.
00016 
00017 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
00018 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
00019 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
00020 COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
00021 AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
00022 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
00023 ------------------------------------------------------------ */
00024 
00025 #ifndef __ZUnicode__
00026 #define __ZUnicode__ 1
00027 #include "zconfig.h"
00028 
00029 #include "ZTypes.h"
00030 #include <string>
00031 
00032 // =================================================================================================
00033 
00034 namespace ZUnicode {
00035 // This template and its two specializations below let us
00036 // determine cleanly whether wchar_t is 16 or 32 bits.
// The primary template is deliberately empty: it is instantiated with sizeof(wchar_t),
// so a platform where that is neither 2 nor 4 fails to compile at the UTF32/UTF16
// typedefs that look up utf32_t/utf16_t.
00037 template <int s> struct Types_T {};
00038 
// sizeof(wchar_t) == 4: wchar_t itself serves as the UTF-32 code unit,
// and uint16 is used for UTF-16.
00039 template <> struct Types_T<4>
00040         {
00041         typedef wchar_t utf32_t;
00042         typedef uint16 utf16_t;
00043         };
00044 
// sizeof(wchar_t) == 2: wchar_t itself serves as the UTF-16 code unit,
// and uint32 is used for UTF-32.
00045 template <> struct Types_T<2>
00046         {
00047         typedef uint32 utf32_t;
00048         typedef wchar_t utf16_t;
00049         };
00050 } // namespace ZUnicode
00051 
00052 // Definitions of UTF32, UTF16 and UTF8
// Exactly one of UTF32/UTF16 is wchar_t, whichever matches sizeof(wchar_t); the other
// is the fixed-width integer type chosen by Types_T. UTF8 is always plain char.
00053 typedef ZUnicode::Types_T<sizeof(wchar_t)>::utf32_t UTF32;
00054 typedef ZUnicode::Types_T<sizeof(wchar_t)>::utf16_t UTF16;
00055 typedef char UTF8;
00056 
// String of UTF32 code units.
00058 typedef std::basic_string<UTF32> string32;
00059 
// String of UTF16 code units.
00061 typedef std::basic_string<UTF16> string16;
00062 
// String of UTF8 code units; because UTF8 is char this is the same type as std::string.
00065 typedef std::basic_string<UTF8> string8;
00066 
00067 namespace ZUnicode {//@{
// Lookup tables defined outside this header.
// NOTE(review): judging by names and sizes, sUTF8SequenceLength is indexed by a byte value
// and the two 7-entry tables by sequence length -- confirm against the definitions.
00070 extern const uint8 sUTF8SequenceLength[256];
00071 extern const uint8 sUTF8StartByteMark[7];
00072 extern const uint8 sUTF8StartByteMask[7];
00073 
// All-ones value of UTF32. NOTE(review): presumably an end-of-input sentinel -- confirm
// with callers.
00075 static const UTF32 kCPEOF = UTF32(-1);
00076 
// Largest value that fits in 16 bits.
00078 static const UTF32 kCPMaxUCS2 = 0xFFFFul;
00079 
// Upper bound (inclusive) accepted by sIsValidCP.
00081 static const UTF32 kCPMaxUTF = 0x10FFFFul;
00082 
// Largest value that fits in 31 bits.
00084 static const UTF32 kCPMaxUCS4 = 0x7FFFFFFFul;
00085 
// U+FFFD. NOTE(review): presumably substituted for undecodable input -- confirm with callers.
00087 static const UTF32 kCPReplacement = 0xFFFDul;
00088 
// Surrogate ranges are half-open: [Begin, End). The helpers below test them with
// '>= Begin' and '< End', so the high range is 0xD800-0xDBFF and the low range 0xDC00-0xDFFF.
00090 static const UTF32 kCPSurrogateHighBegin = 0xD800ul;
00091 static const UTF32 kCPSurrogateHighEnd  = 0xDC00ul;
00092 static const UTF32 kCPSurrogateLowBegin = 0xDC00ul;
00093 static const UTF32 kCPSurrogateLowEnd = 0xE000ul;
00095 
00096 /*
00097 We've got five ranges to deal with:
00098  0x0000 -   0xD7FF Small normal
00099  0xD800 -   0xDBFF High surrogate
00100  0xDC00 -   0xDFFF Low surrogate
00101  0xE000 -   0xFFFF Big normal
00102 0x10000 - 0x10FFFF Big normal, but requiring more than 16 bits to represent.
00103 
00104 We also have to beware that UTF16 and UTF32 may be signed types, so these methods take
00105 uint32s and coerce the boundary constants to ensure they're also uint32. Callers
00106 should work with uint32s, being careful to prevent sign-extension when they load
00107 from smaller sources.
00108 */
00109 
// Returns true if iCP lies below the surrogate block, or from kCPSurrogateLowEnd up to and
// including kCPMaxUTF; i.e. it is neither a surrogate nor beyond 0x10FFFF.
// UTF32 may be a signed type (it is wchar_t when wchar_t is 32 bits), so the boundary
// constants are coerced to uint32 like every other range helper in this namespace, keeping
// all comparisons unsigned-vs-unsigned.
00110 inline bool sIsValidCP(uint32 iCP)
00111         {
00112         return iCP < uint32(kCPSurrogateHighBegin) || (iCP >= uint32(kCPSurrogateLowEnd) && iCP <= uint32(kCPMaxUTF));
00113         }
00114 
// Combines a UTF-16 surrogate pair into a single code point:
//   0x10000 + ((hi - 0xD800) << 10) + (lo - 0xDC00)
// No validation is done here; the caller must already know that hi is a high surrogate
// and lo a low surrogate. Both are taken as uint32 so the subtractions are unsigned.
00115 inline UTF32 sUTF32FromSurrogates(uint32 hi, uint32 lo)
00116         {
00117         static const int kSurrogateShift = 10;
00118         static const uint32 kSurrogateBase = 0x10000ul;
00119         return kSurrogateBase
00120                 + ((hi - uint32(kCPSurrogateHighBegin)) << kSurrogateShift)
00121                 + (lo - uint32(kCPSurrogateLowBegin));
00122         }
00123 
// True if iCU is in the low surrogate range [kCPSurrogateLowBegin, kCPSurrogateLowEnd),
// i.e. 0xDC00-0xDFFF.
00124 inline bool sIsLowSurrogate(uint32 iCU)
00125         {
00126         return iCU >= uint32(kCPSurrogateLowBegin) && iCU < uint32(kCPSurrogateLowEnd);
00127         }
00128 
// True if iCU is below the surrogate block (less than 0xD800).
00129 inline bool sIsSmallNormal(uint32 iCU)
00130         {
00131         return iCU < uint32(kCPSurrogateHighBegin);
00132         }
00133 
// True if iCU is below the low surrogate range (less than 0xDC00), which covers both
// the small normal values and the high surrogates.
00134 inline bool sIsSmallNormalOrHighSurrogate(uint32 iCU)
00135         {
00136         return iCU < uint32(kCPSurrogateLowBegin);
00137         }
00138 
// True if iCU is at or above the end of the surrogate block (0xE000 or greater).
// There is no upper bound, so values above kCPMaxUTF also return true.
00139 inline bool sIsBigNormalOrBeyond(uint32 iCU)
00140         {
00141         return iCU >= uint32(kCPSurrogateLowEnd);
00142         }
00143 
// True if the top two bits of iCU are 10 (bit pattern 10xxxxxx).
00144 inline bool sIsContinuation(uint8 iCU)
00145         {
00146         return (iCU & 0xC0) == 0x80;
00147         }
00148 
// Shifts ioCP left by six bits and adds the low six bits of iContinuation.
// The top two bits of iContinuation are masked off without being checked.
00149 inline void sAppendContinuation(uint32& ioCP, uint8 iContinuation)
00150         {
00151         ioCP = (ioCP << 6) + (iContinuation & 0x3F);
00152         }
00153 } // namespace ZUnicode
00154 
00155 // =================================================================================================
00156 
// Code point + string16. Declared here; the definition is not in this header.
00158 string16 operator+(UTF32 iCP, const string16& iString);
00159 
// string16 += code point. Declared here; the definition is not in this header.
00161 string16& operator+=(string16& ioString, UTF32 iCP);
00162 
// string16 + code point: copies iString, applies operator+= above to the copy,
// and returns the copy by value. iString itself is not modified.
00164 inline string16 operator+(const string16& iString, UTF32 iCP)
00165         {
00166         string16 temp = iString;
00167         return temp += iCP;
00168         }
00169 
00170 
// Code point + string8. Declared here; the definition is not in this header.
00172 string8 operator+(UTF32 iCP, const string8& iString);
00173 
// string8 += code point. Declared here; the definition is not in this header.
00175 string8& operator+=(string8& ioString, UTF32 iCP);
00176 
// string8 + code point: copies iString, applies operator+= above to the copy,
// and returns the copy by value. iString itself is not modified.
00178 inline string8 operator+(const string8& iString, UTF32 iCP)
00179         {
00180         string8 temp = iString;
00181         return temp += iCP;
00182         }
00183 
00184 // =================================================================================================
00185 #pragma mark -
00186 #pragma mark * Include gnarly template stuff
00187 
00188 #include "ZUnicodePriv.h"
00189 
00190 // =================================================================================================
00191 #pragma mark -
00192 #pragma mark * ZUnicode
00193 
00194 namespace ZUnicode {
00195 
00196 /* These are all template functions, they'll take anything that behaves like a string::iterator
00197 or a pointer. They call through to static member functions of template structs declared in
00198 ZUnicodePriv.h. The actual code for the three different code unit types is in ZUnicodePrivB.h,
00199 and we do explicit template instantiations of the structs in ZUnicode.cpp. So if you have a new
00200 kind of iterator you'll need to do something similar in your code. */
00201 //@{
// Forwards to Functions_CountCU_T<I>::sCountCU. Only a start iterator is passed.
// NOTE(review): presumably counts code units up to a terminating zero -- confirm in ZUnicodePriv.h.
00206 template <class I>
00207 inline size_t sCountCU(I iSource)
00208         { return Functions_CountCU_T<I>::sCountCU(iSource); }
00209 
// Forwards to Functions_Count_T<I>::sCountCP. Only a start iterator is passed.
// NOTE(review): presumably counts code points up to a terminating zero -- confirm in ZUnicodePriv.h.
00212 template <class I>
00213 inline size_t sCountCP(I iSource)
00214         { return Functions_Count_T<I>::sCountCP(iSource); }
00215 
// Forwards to Functions_Count_T<I>::sCount, which reports through the two out-pointers.
// NOTE(review): whether null out-pointers are accepted is decided by the callee -- confirm.
00218 template <class I>
00219 inline void sCount(I iSource, size_t* oCountCU, size_t* oCountCP)
00220         { Functions_Count_T<I>::sCount(iSource, oCountCU, oCountCP); }
00222 
00223 //@{
// Unbounded form: forwards iSource and iCountCP to Functions_Count_T<I>::sCPToCU.
// NOTE(review): presumably returns the code unit offset of the iCountCP'th code point -- confirm.
00228 template <class I>
00229 inline size_t sCPToCU(I iSource, size_t iCountCP)
00230         { return Functions_Count_T<I>::sCPToCU(iSource, iCountCP); }
00231 
// Length-bounded form: additionally passes a code unit count and an out-pointer.
// NOTE(review): presumably stops after iCountCU code units and stores the number of code
// points actually covered in *oCountCP -- confirm.
00235 template <class I>
00236 inline size_t sCPToCU(I iSource, size_t iCountCU, size_t iCountCP, size_t* oCountCP)
00237         { return Functions_Count_T<I>::sCPToCU(iSource, iCountCU, iCountCP, oCountCP); }
00238 
// Iterator-bounded form: forwards the range [iSource, iEnd), the code point count and the
// out-pointer to Functions_Count_T<I>::sCPToCU.
// Fix: iEnd is now forwarded as the range end. Previously iSource was passed for both
// bounds, so the callee always received an empty range and iEnd was ignored.
// NOTE(review): presumably *oCountCP receives the number of code points actually covered
// -- confirm in ZUnicodePriv.h.
00242 template <class I>
00243 inline size_t sCPToCU(I iSource, I iEnd, size_t iCountCP, size_t* oCountCP)
00244         { return Functions_Count_T<I>::sCPToCU(iSource, iEnd, iCountCP, oCountCP); }
00245 
// Length-bounded form: forwards iSource and a code unit count to Functions_Count_T<I>::sCUToCP.
// NOTE(review): presumably returns the number of code points in those code units -- confirm.
00248 template <class I>
00249 inline size_t sCUToCP(I iSource, size_t iCountCU)
00250         { return Functions_Count_T<I>::sCUToCP(iSource, iCountCU); }
00251 
// Iterator-bounded form: forwards the range [iSource, iEnd) to Functions_Count_T<I>::sCUToCP.
00254 template <class I>
00255 inline size_t sCUToCP(I iSource, I iEnd)
00256         { return Functions_Count_T<I>::sCUToCP(iSource, iEnd); }
00258 
00259 //@{
// Unbounded form: forwards to Functions_Read_T<I>::sAlign, which may modify ioCurrent.
// NOTE(review): presumably moves ioCurrent to the start of a code point -- confirm in ZUnicodePriv.h.
00264 template <class I>
00265 inline void sAlign(I& ioCurrent)
00266         { Functions_Read_T<I>::sAlign(ioCurrent); }
00267 
// Bounded form: as above, with iEnd passed as the limit.
00270 template <class I>
00271 inline void sAlign(I& ioCurrent, I iEnd)
00272         { Functions_Read_T<I>::sAlign(ioCurrent, iEnd); }
00274 
00275 //@{
// Unbounded form: forwards to Functions_Read_T<I>::sInc, which may modify ioCurrent.
// The 'return' of a void expression is legal C++ and simply forwards the call.
// NOTE(review): presumably advances ioCurrent by one code point -- confirm.
00279 template <class I>
00280 inline void sInc(I& ioCurrent)
00281         { return Functions_Read_T<I>::sInc(ioCurrent); }
00282 
// Bounded form: also passes iEnd and returns the callee's bool.
// NOTE(review): the bool presumably reports whether the advance succeeded before iEnd -- confirm.
00286 template <class I>
00287 inline bool sInc(I& ioCurrent, I iEnd)
00288         { return Functions_Read_T<I>::sInc(ioCurrent, iEnd); }
00289 
// Unbounded form: forwards to Functions_Read_T<I>::sDec, which may modify ioCurrent.
// NOTE(review): presumably moves ioCurrent back by one code point -- confirm.
00291 template <class I>
00292 inline void sDec(I& ioCurrent)
00293         { return Functions_Read_T<I>::sDec(ioCurrent); }
00294 
// Bounded form: passes both iStart and iEnd and returns the callee's bool.
// NOTE(review): the bool presumably reports whether a step back within [iStart, iEnd) was
// possible -- confirm.
00299 template <class I>
00300 inline bool sDec(I iStart, I& ioCurrent, I iEnd)
00301         { return Functions_Read_T<I>::sDec(iStart, ioCurrent, iEnd); }
00302 
// Unbounded form: forwards to Functions_Read_T<I>::sRead and returns its UTF32 result.
// iCurrent is taken by value, so the caller's iterator is not advanced.
00304 template <class I>
00305 inline UTF32 sRead(I iCurrent)
00306         { return Functions_Read_T<I>::sRead(iCurrent); }
00307 
// Bounded form: the result is delivered through oCP and the callee's bool is returned.
// NOTE(review): the bool presumably reports whether a code point was read before iEnd -- confirm.
00310 template <class I>
00311 inline bool sRead(I iCurrent, I iEnd, UTF32& oCP)
00312         { return Functions_Read_T<I>::sRead(iCurrent, iEnd, oCP); }
00313 
// Unbounded form: forwards to Functions_Read_T<I>::sReadInc and returns its UTF32 result.
// ioCurrent is passed by reference, so the callee may advance it.
00317 template <class I>
00318 inline UTF32 sReadInc(I& ioCurrent)
00319         { return Functions_Read_T<I>::sReadInc(ioCurrent); }
00320 
// Bounded form: the result is delivered through oCP and the callee's bool is returned.
// NOTE(review): the bool presumably reports whether a code point was read before iEnd -- confirm.
00325 template <class I>
00326 inline bool sReadInc(I& ioCurrent, I iEnd, UTF32& oCP)
00327         { return Functions_Read_T<I>::sReadInc(ioCurrent, iEnd, oCP); }
00328 
// Bounded form with an in/out counter.
// NOTE(review): ioCountSkipped is presumably incremented for each code unit skipped as
// invalid -- confirm in ZUnicodePriv.h.
00335 template <class I>
00336 inline bool sReadInc(I& ioCurrent, I iEnd, UTF32& oCP, size_t& ioCountSkipped)
00337         { return Functions_Read_T<I>::sReadInc(ioCurrent, iEnd, oCP, ioCountSkipped); }
00338 
// Unbounded form: forwards to Functions_Read_T<I>::sDecRead and returns its UTF32 result.
// ioCurrent is passed by reference, so the callee may move it.
00340 template <class I>
00341 inline UTF32 sDecRead(I& ioCurrent)
00342         { return Functions_Read_T<I>::sDecRead(ioCurrent); }
00343 
// Bounded form: passes iStart and iEnd, delivers the result through oCP and returns the
// callee's bool.
00346 template <class I>
00347 inline bool sDecRead(I iStart, I& ioCurrent, I iEnd, UTF32& oCP)
00348         { return Functions_Read_T<I>::sDecRead(iStart, ioCurrent, iEnd, oCP); }
00349 
// Forwards to Functions_Write_T<I>::sWrite. iDest is taken by value, so the caller's
// iterator is not advanced.
// NOTE(review): the bool presumably reports whether iCP fitted before iEnd -- confirm.
00353 template <class I>
00354 inline bool sWrite(I iDest, I iEnd, UTF32 iCP)
00355         { return Functions_Write_T<I>::sWrite(iDest, iEnd, iCP); }
00356 
// Forwards to Functions_Write_T<I>::sWriteInc. ioDest is passed by reference, so the
// callee may advance it.
// NOTE(review): the bool presumably reports whether iCP fitted before iEnd -- confirm.
00361 template <class I>
00362 inline bool sWriteInc(I& ioDest, I iEnd, UTF32 iCP)
00363         { return Functions_Write_T<I>::sWriteInc(ioDest, iEnd, iCP); }
00365 
00366 //@{
// sAsUTF32 overload set: every overload returns a string32 by value.

// Generic source given as iterator plus code unit count; forwards to Functions_Convert_T<I>.
00369 template <class I>
00370 inline string32 sAsUTF32(I iSource, size_t iCountCU)
00371         { return Functions_Convert_T<I>::sAsUTF32(iSource, iCountCU); }
00372 
// Generic source given as the iterator range [iSource, iEnd); forwards to Functions_Convert_T<I>.
00373 template <class I>
00374 inline string32 sAsUTF32(I iSource, I iEnd)
00375         { return Functions_Convert_T<I>::sAsUTF32(iSource, iEnd); }
00376 
// Source is already UTF32: plain copy. The basic_string constructor used here reads up
// to a terminating zero, so iString must be zero-terminated.
00377 inline string32 sAsUTF32(const UTF32* iString)
00378         { return string32(iString); }
00379 
// Source is already UTF32: copies exactly iCountCU code units.
00380 inline string32 sAsUTF32(const UTF32* iString, size_t iCountCU)
00381         { return string32(iString, iCountCU); }
00382 
// Source is already a string32: returns a copy.
00383 inline string32 sAsUTF32(const string32& iString)
00384         { return iString; }
00385 
// Source is a string16: goes through the (iterator, count) converter with the string's
// begin() and size().
00386 inline string32 sAsUTF32(const string16& iString)
00387         {
00388         return Functions_Convert_T<string16::const_iterator>::sAsUTF32(iString.begin(), iString.size());
00389         }
00390 
// Source is a string8: goes through the (iterator, count) converter with the string's
// begin() and size().
00391 inline string32 sAsUTF32(const string8& iString)
00392         {
00393         return Functions_Convert_T<string8::const_iterator>::sAsUTF32(iString.begin(), iString.size());
00394         }
00396 
00397 //@{
// sAsUTF16 overload set: every overload returns a string16 by value.

// Generic source given as iterator plus code unit count; forwards to Functions_Convert_T<I>.
00400 template <class I>
00401 inline string16 sAsUTF16(I iSource, size_t iCountCU)
00402         { return Functions_Convert_T<I>::sAsUTF16(iSource, iCountCU); }
00403 
// Generic source given as the iterator range [iSource, iEnd); forwards to Functions_Convert_T<I>.
00404 template <class I>
00405 inline string16 sAsUTF16(I iSource, I iEnd)
00406         { return Functions_Convert_T<I>::sAsUTF16(iSource, iEnd); }
00407 
// Source is already UTF16: plain copy. The basic_string constructor used here reads up
// to a terminating zero, so iString must be zero-terminated.
00408 inline string16 sAsUTF16(const UTF16* iString)
00409         { return string16(iString); }
00410 
// Source is already UTF16: copies exactly iCountCU code units.
00411 inline string16 sAsUTF16(const UTF16* iString, size_t iCountCU)
00412         { return string16(iString, iCountCU); }
00413 
// Source is a string32: goes through the (iterator, count) converter with the string's
// begin() and size().
00414 inline string16 sAsUTF16(const string32& iString)
00415         {
00416         return Functions_Convert_T<string32::const_iterator>::sAsUTF16(iString.begin(), iString.size());
00417         }
00418 
// Source is already a string16: returns a copy.
00419 inline string16 sAsUTF16(const string16& iString)
00420         { return iString; }
00421 
// Source is a string8. With '#if 1' the active branch is the out-of-line declaration,
// whose definition is not in this header; the inline template-based version in the
// '#else' branch is compiled out.
00422 #if 1
00423 string16 sAsUTF16(const string8& iString);
00424 #else
00425 inline string16 sAsUTF16(const string8& iString)
00426         {
00427         return Functions_Convert_T<string8::const_iterator>::sAsUTF16(iString.begin(), iString.size());
00428         }
00429 #endif
00430 
00431 
00432 //@{
// sAsUTF8 overload set: every overload returns a string8 by value.

// Generic source given as iterator plus code unit count; forwards to Functions_Convert_T<I>.
00435 template <class I>
00436 inline string8 sAsUTF8(I iSource, size_t iCountCU)
00437         { return Functions_Convert_T<I>::sAsUTF8(iSource, iCountCU); }
00438 
// Generic source given as the iterator range [iSource, iEnd); forwards to Functions_Convert_T<I>.
00439 template <class I>
00440 inline string8 sAsUTF8(I iSource, I iEnd)
00441         { return Functions_Convert_T<I>::sAsUTF8(iSource, iEnd); }
00442 
// Source is already UTF8: plain copy. The basic_string constructor used here reads up
// to a terminating zero, so iString must be zero-terminated. No validation is performed.
00443 inline string8 sAsUTF8(const UTF8* iString)
00444         { return string8(iString); }
00445 
// Source is already UTF8: copies exactly iCountCU code units. No validation is performed.
00446 inline string8 sAsUTF8(const UTF8* iString, size_t iCountCU)
00447         { return string8(iString, iCountCU); }
00448 
// Source is a string32: goes through the (iterator, count) converter with the string's
// begin() and size().
00449 inline string8 sAsUTF8(const string32& iString)
00450         {
00451         return Functions_Convert_T<string32::const_iterator>::sAsUTF8(iString.begin(), iString.size());
00452         }
00453 
// Source is a string16: goes through the (iterator, count) converter with the string's
// begin() and size().
00454 inline string8 sAsUTF8(const string16& iString)
00455         {
00456         return Functions_Convert_T<string16::const_iterator>::sAsUTF8(iString.begin(), iString.size());
00457         }
00458 
// Source is already a string8: returns a copy.
00459 inline string8 sAsUTF8(const string8& iString)
00460         { return iString; }
00462 
00463 //@{
// Buffer-to-buffer conversion from UTF32 to UTF8. Declared here; defined outside this header.
// Both overloads return void; the second differs only by the extra oSourceCountSkipped
// out-pointer.
// NOTE(review): the o* pointers presumably receive the counts consumed, written and
// converted; whether they may be null is not visible here -- confirm in ZUnicode.cpp.
00466 void sUTF32ToUTF8(const UTF32* iSource, size_t iSourceCount,
00467                                                 size_t* oSourceCount,
00468                                                 UTF8* iDest, size_t iDestCU,
00469                                                 size_t* oDestCU, size_t* oCountCP);
00470 
00471 void sUTF32ToUTF8(const UTF32* iSource, size_t iSourceCount,
00472                                                 size_t* oSourceCount, size_t* oSourceCountSkipped,
00473                                                 UTF8* iDest, size_t iDestCU,
00474                                                 size_t* oDestCU, size_t* oCountCP);
00475 
00476 
// Buffer-to-buffer conversion from UTF8 to UTF32. Declared here; defined outside this header.
// The second overload differs only by the extra oSourceCUSkipped out-pointer.
// NOTE(review): the meaning of the bool result and whether the o* pointers may be null
// are not visible here -- confirm in ZUnicode.cpp.
00477 bool sUTF8ToUTF32(const UTF8* iSource, size_t iSourceCU,
00478                                                 size_t* oSourceCU,
00479                                                 UTF32* iDest, size_t iDestCount,
00480                                                 size_t* oDestCount);
00481 
00482 bool sUTF8ToUTF32(const UTF8* iSource, size_t iSourceCU,
00483                                                 size_t* oSourceCU, size_t* oSourceCUSkipped,
00484                                                 UTF32* iDest, size_t iDestCount,
00485                                                 size_t* oDestCount);
00486 
00487 
// Buffer-to-buffer conversion from UTF32 to UTF16. Declared here; defined outside this header.
// Both overloads return void; the second differs only by the extra oSourceCountSkipped
// out-pointer.
// NOTE(review): the o* pointers presumably receive the counts consumed, written and
// converted; whether they may be null is not visible here -- confirm in ZUnicode.cpp.
00488 void sUTF32ToUTF16(const UTF32* iSource, size_t iSourceCount,
00489                                                 size_t* oSourceCount,
00490                                                 UTF16* iDest, size_t iDestCU,
00491                                                 size_t* oDestCU, size_t* oCountCP);
00492 
00493 void sUTF32ToUTF16(const UTF32* iSource, size_t iSourceCount,
00494                                                 size_t* oSourceCount, size_t* oSourceCountSkipped,
00495                                                 UTF16* iDest, size_t iDestCU,
00496                                                 size_t* oDestCU, size_t* oCountCP);
00497 
00498 
// Buffer-to-buffer conversion from UTF16 to UTF32. Declared here; defined outside this header.
// The second overload differs only by the extra oSourceCUSkipped out-pointer.
// NOTE(review): the meaning of the bool result and whether the o* pointers may be null
// are not visible here -- confirm in ZUnicode.cpp.
00499 bool sUTF16ToUTF32(const UTF16* iSource, size_t iSourceCU,
00500                                                 size_t* oSourceCU,
00501                                                 UTF32* iDest, size_t iDestCount,
00502                                                 size_t* oDestCount);
00503 
00504 bool sUTF16ToUTF32(const UTF16* iSource, size_t iSourceCU,
00505                                                 size_t* oSourceCU, size_t* oSourceCUSkipped,
00506                                                 UTF32* iDest, size_t iDestCount,
00507                                                 size_t* oDestCount);
00508 
00509 
// Buffer-to-buffer conversion from UTF16 to UTF8. Declared here; defined outside this header.
// Unlike the UTF32 conversions these take an iMaxCP argument. The second overload differs
// only by the extra oSourceCUSkipped out-pointer.
// NOTE(review): iMaxCP presumably caps the number of code points converted; the meaning of
// the bool result is not visible here -- confirm in ZUnicode.cpp.
00510 bool sUTF16ToUTF8(const UTF16* iSource, size_t iSourceCU,
00511                                                 size_t* oSourceCU,
00512                                                 UTF8* iDest, size_t iDestCU,
00513                                                 size_t* oDestCU, size_t iMaxCP, size_t* oCountCP);
00514 
00515 bool sUTF16ToUTF8(const UTF16* iSource, size_t iSourceCU,
00516                                                 size_t* oSourceCU, size_t* oSourceCUSkipped,
00517                                                 UTF8* iDest, size_t iDestCU,
00518                                                 size_t* oDestCU, size_t iMaxCP, size_t* oCountCP);
00519 
00520 
// Buffer-to-buffer conversion from UTF8 to UTF16. Declared here; defined outside this header.
// Unlike the UTF32 conversions these take an iMaxCP argument. The second overload differs
// only by the extra oSourceCUSkipped out-pointer.
// NOTE(review): iMaxCP presumably caps the number of code points converted; the meaning of
// the bool result is not visible here -- confirm in ZUnicode.cpp.
00521 bool sUTF8ToUTF16(const UTF8* iSource, size_t iSourceCU,
00522                                                 size_t* oSourceCU,
00523                                                 UTF16* iDest, size_t iDestCU,
00524                                                 size_t* oDestCU, size_t iMaxCP, size_t* oCountCP);
00525 
00526 bool sUTF8ToUTF16(const UTF8* iSource, size_t iSourceCU,
00527                                                 size_t* oSourceCU, size_t* oSourceCUSkipped,
00528                                                 UTF16* iDest, size_t iDestCU,
00529                                                 size_t* oDestCU, size_t iMaxCP, size_t* oCountCP);
00531 
00532 // ==================================================//@{
// Code point classification predicates. Declared here; defined outside this header.
// NOTE(review): which characters each predicate accepts (e.g. ASCII only or full Unicode)
// is not visible here -- confirm in ZUnicode.cpp. Note that sIsValid takes a UTF32 whereas
// the inline sIsValidCP in this namespace takes a uint32.
00535 bool sIsValid(UTF32 iCP);
00536 bool sIsAlpha(UTF32 iCP);
00537 bool sIsDigit(UTF32 iCP);
00538 bool sIsAlphaDigit(UTF32 iCP);
00539 bool sIsWhitespace(UTF32 iCP);
00540 bool sIsEOL(UTF32 iCP);
00542 
00543 // ==================================================//@{
// Case conversion for a single code point. Declared here; defined outside this header.
// NOTE(review): the set of characters with case mappings is not visible here -- confirm.
00546 UTF32 sToUpper(UTF32 iCP);
00547 UTF32 sToLower(UTF32 iCP);
00548 
// Case conversion for a whole string8; the result is returned by value and the argument
// is taken by const reference.
00549 string8 sToLower(const string8& iString);
00550 string8 sToUpper(const string8& iString);
00551 
// NOTE(review): presumably returns the numeric value of a hexadecimal digit; the result
// for a non-hex code point is not visible here -- confirm in ZUnicode.cpp.
00552 int sHexValue(UTF32 iCP);
00554 
00555 } // namespace ZUnicode
00556 
00557 #endif // __ZUnicode__

Generated on Thu Jul 26 11:21:52 2007 for ZooLib by  doxygen 1.4.7