Merge "Split UTF functions from String8/16"

This commit is contained in:
Kenny Root 2010-11-15 11:11:22 -08:00 committed by Android (Google) Code Review
commit 7986fe5035
10 changed files with 986 additions and 767 deletions

View File

@ -19,39 +19,12 @@
#include <utils/Errors.h>
#include <utils/SharedBuffer.h>
#include <stdint.h>
#include <sys/types.h>
#include <utils/Unicode.h>
// ---------------------------------------------------------------------------
extern "C" {
typedef uint16_t char16_t;
// Standard string functions on char16 strings.
int strcmp16(const char16_t *, const char16_t *);
int strncmp16(const char16_t *s1, const char16_t *s2, size_t n);
size_t strlen16(const char16_t *);
size_t strnlen16(const char16_t *, size_t);
char16_t *strcpy16(char16_t *, const char16_t *);
char16_t *strncpy16(char16_t *, const char16_t *, size_t);
// Version of comparison that supports embedded nulls.
// This is different than strncmp() because we don't stop
// at a nul character and consider the strings to be different
// if the lengths are different (thus we need to supply the
// lengths of both strings). This can also be used when
// your string is not nul-terminated as it will have the
// equivalent result as strcmp16 (unlike strncmp16).
int strzcmp16(const char16_t *s1, size_t n1, const char16_t *s2, size_t n2);
// Version of strzcmp16 for comparing strings in different endianness.
int strzcmp16_h_n(const char16_t *s1H, size_t n1, const char16_t *s2N, size_t n2);
// Convert UTF-8 to UTF-16 including surrogate pairs
void utf8_to_utf16(const uint8_t *src, size_t srcLen, char16_t* dst, const size_t dstLen);
}
// ---------------------------------------------------------------------------

View File

@ -18,122 +18,16 @@
#define ANDROID_STRING8_H
#include <utils/Errors.h>
#include <utils/SharedBuffer.h>
#include <utils/Unicode.h>
// Need this for the char16_t type; String8.h should not
// be dependent on the String16 class.
#include <utils/String16.h>
#include <stdint.h>
#include <string.h>
#include <sys/types.h>
// ---------------------------------------------------------------------------
extern "C" {
typedef uint32_t char32_t;
size_t strlen32(const char32_t *);
size_t strnlen32(const char32_t *, size_t);
/*
* Returns the length of "src" when "src" is valid UTF-8 string.
* Returns 0 if src is NULL, 0-length string or non UTF-8 string.
* This function should be used to determine whether "src" is valid UTF-8
* characters with valid unicode codepoints. "src" must be null-terminated.
*
* If you are going to use other GetUtf... functions defined in this header
* with a string which may not be valid UTF-8 with valid codepoint (from 0 to
* 0x10FFFF), you should use this function before calling others, since the
* other functions do not check whether the string is valid UTF-8 or not.
*
* If you do not care whether "src" is valid UTF-8 or not, you should use
* strlen() as usual, which should be much faster.
*/
size_t utf8_length(const char *src);
/*
* Returns the UTF-32 length of "src".
*/
size_t utf32_length(const char *src, size_t src_len);
/*
* Returns the UTF-8 length of "src".
*/
size_t utf8_length_from_utf16(const char16_t *src, size_t src_len);
/*
* Returns the UTF-8 length of "src".
*/
size_t utf8_length_from_utf32(const char32_t *src, size_t src_len);
/*
* Returns the unicode value at "index".
* Returns -1 when the index is invalid (equals to or more than "src_len").
* If returned value is positive, it is able to be converted to char32_t, which
* is unsigned. Then, if "next_index" is not NULL, the next index to be used is
* stored in "next_index". "next_index" can be NULL.
*/
int32_t utf32_at(const char *src, size_t src_len,
size_t index, size_t *next_index);
/*
* Stores a UTF-32 string converted from "src" in "dst", if "dst_length" is not
* large enough to store the string, the part of the "src" string is stored
* into "dst".
* Returns the size actually used for storing the string.
* "dst" is not null-terminated when dst_len is fully used (like strncpy).
*/
size_t utf8_to_utf32(const char* src, size_t src_len,
char32_t* dst, size_t dst_len);
/*
* Stores a UTF-8 string converted from "src" in "dst", if "dst_length" is not
* large enough to store the string, the part of the "src" string is stored
* into "dst" as much as possible. See the examples for more detail.
* Returns the size actually used for storing the string.
* "dst" is not null-terminated when dst_len is fully used (like strncpy).
*
* Example 1
* "src" == \u3042\u3044 (\xE3\x81\x82\xE3\x81\x84)
* "src_len" == 2
* "dst_len" >= 7
* ->
* Returned value == 6
* "dst" becomes \xE3\x81\x82\xE3\x81\x84\0
* (note that "dst" is null-terminated)
*
* Example 2
* "src" == \u3042\u3044 (\xE3\x81\x82\xE3\x81\x84)
* "src_len" == 2
* "dst_len" == 5
* ->
* Returned value == 3
* "dst" becomes \xE3\x81\x82\0
* (note that "dst" is null-terminated, but \u3044 is not stored in "dst"
* since "dst" does not have enough size to store the character)
*
* Example 3
* "src" == \u3042\u3044 (\xE3\x81\x82\xE3\x81\x84)
* "src_len" == 2
* "dst_len" == 6
* ->
* Returned value == 6
* "dst" becomes \xE3\x81\x82\xE3\x81\x84
* (note that "dst" is NOT null-terminated, like strncpy)
*/
size_t utf32_to_utf8(const char32_t* src, size_t src_len,
char* dst, size_t dst_len);
size_t utf16_to_utf8(const char16_t* src, size_t src_len,
char* dst, size_t dst_len);
}
#include <string.h> // for strcmp
// ---------------------------------------------------------------------------
namespace android {
class String16;
class TextOutput;
//! This is a string holding UTF-8 characters. Does not allow the value more
@ -182,7 +76,7 @@ public:
size_t getUtf32Length() const;
int32_t getUtf32At(size_t index,
size_t *next_index) const;
size_t getUtf32(char32_t* dst, size_t dst_len) const;
void getUtf32(char32_t* dst) const;
inline String8& operator=(const String8& other);
inline String8& operator=(const char* other);

161
include/utils/Unicode.h Normal file
View File

@ -0,0 +1,161 @@
/*
* Copyright (C) 2005 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef ANDROID_UNICODE_H
#define ANDROID_UNICODE_H
#include <sys/types.h>
#include <stdint.h>
extern "C" {
typedef uint32_t char32_t;
typedef uint16_t char16_t;
// Standard string functions on char16_t strings.
int strcmp16(const char16_t *, const char16_t *);
int strncmp16(const char16_t *s1, const char16_t *s2, size_t n);
size_t strlen16(const char16_t *);
size_t strnlen16(const char16_t *, size_t);
char16_t *strcpy16(char16_t *, const char16_t *);
char16_t *strncpy16(char16_t *, const char16_t *, size_t);
// Version of comparison that supports embedded nulls.
// This is different than strncmp() because we don't stop
// at a nul character and consider the strings to be different
// if the lengths are different (thus we need to supply the
// lengths of both strings). This can also be used when
// your string is not nul-terminated as it will have the
// equivalent result as strcmp16 (unlike strncmp16).
int strzcmp16(const char16_t *s1, size_t n1, const char16_t *s2, size_t n2);
// Version of strzcmp16 for comparing strings in different endianness.
int strzcmp16_h_n(const char16_t *s1H, size_t n1, const char16_t *s2N, size_t n2);
// Standard string functions on char32_t strings.
size_t strlen32(const char32_t *);
size_t strnlen32(const char32_t *, size_t);
/**
* Measure the length of a UTF-32 string in UTF-8. If the string is invalid
* such as containing a surrogate character, -1 will be returned.
*/
ssize_t utf32_to_utf8_length(const char32_t *src, size_t src_len);
/**
* Stores a UTF-8 string converted from "src" in "dst", if "dst_length" is not
* large enough to store the string, the part of the "src" string is stored
* into "dst" as much as possible. See the examples for more detail.
* Returns the size actually used for storing the string.
* "dst" is not null-terminated when dst_len is fully used (like strncpy).
*
* Example 1
* "src" == \u3042\u3044 (\xE3\x81\x82\xE3\x81\x84)
* "src_len" == 2
* "dst_len" >= 7
* ->
* Returned value == 6
* "dst" becomes \xE3\x81\x82\xE3\x81\x84\0
* (note that "dst" is null-terminated)
*
* Example 2
* "src" == \u3042\u3044 (\xE3\x81\x82\xE3\x81\x84)
* "src_len" == 2
* "dst_len" == 5
* ->
* Returned value == 3
* "dst" becomes \xE3\x81\x82\0
* (note that "dst" is null-terminated, but \u3044 is not stored in "dst"
* since "dst" does not have enough size to store the character)
*
* Example 3
* "src" == \u3042\u3044 (\xE3\x81\x82\xE3\x81\x84)
* "src_len" == 2
* "dst_len" == 6
* ->
* Returned value == 6
* "dst" becomes \xE3\x81\x82\xE3\x81\x84
* (note that "dst" is NOT null-terminated, like strncpy)
*/
void utf32_to_utf8(const char32_t* src, size_t src_len, char* dst);
/**
* Returns the unicode value at "index".
* Returns -1 when the index is invalid (equals to or more than "src_len").
* If returned value is positive, it is able to be converted to char32_t, which
* is unsigned. Then, if "next_index" is not NULL, the next index to be used is
* stored in "next_index". "next_index" can be NULL.
*/
int32_t utf32_from_utf8_at(const char *src, size_t src_len, size_t index, size_t *next_index);
/**
* Returns the UTF-8 length of UTF-16 string "src".
*/
ssize_t utf16_to_utf8_length(const char16_t *src, size_t src_len);
/**
* Converts a UTF-16 string to UTF-8. The destination buffer must be large
* enough to fit the UTF-16 as measured by utf16_to_utf8_length with an added
* NULL terminator.
*/
void utf16_to_utf8(const char16_t* src, size_t src_len, char* dst);
/**
* Returns the length of "src" when "src" is valid UTF-8 string.
* Returns 0 if src is NULL or 0-length string. Returns -1 when the source
* is an invalid string.
*
* This function should be used to determine whether "src" is valid UTF-8
* characters with valid unicode codepoints. "src" must be null-terminated.
*
* If you are going to use other utf8_to_... functions defined in this header
* with a string which may not be valid UTF-8 with valid codepoint (from 0 to
* 0x10FFFF), you should use this function before calling others, since the
* other functions do not check whether the string is valid UTF-8 or not.
*
* If you do not care whether "src" is valid UTF-8 or not, you should use
* strlen() as usual, which should be much faster.
*/
ssize_t utf8_length(const char *src);
/**
* Measure the length, in UTF-32 characters, of the UTF-8 string "src".
*/
size_t utf8_to_utf32_length(const char *src, size_t src_len);
/**
* Stores a UTF-32 string converted from "src" in "dst". "dst" must be large
* enough to store the entire converted string as measured by
* utf8_to_utf32_length plus space for a NULL terminator.
*/
void utf8_to_utf32(const char* src, size_t src_len, char32_t* dst);
/**
* Returns the UTF-16 length of UTF-8 string "src".
*/
ssize_t utf8_to_utf16_length(const uint8_t* src, size_t srcLen);
/**
* Convert UTF-8 to UTF-16 including surrogate pairs. The destination buffer
* must be large enough to hold the result as measured by utf8_to_utf16_length
* plus an added NULL terminator.
*/
void utf8_to_utf16(const uint8_t* src, size_t srcLen, char16_t* dst);
}
#endif

View File

@ -41,6 +41,7 @@ commonSources:= \
TextOutput.cpp \
Threads.cpp \
Timers.cpp \
Unicode.cpp \
VectorImpl.cpp \
ZipFileCRO.cpp \
ZipFileRO.cpp \

View File

@ -444,15 +444,51 @@ void ResStringPool::uninit()
}
}
// Decodes a length prefix stored at "str" in units of "chrsz" bytes and
// stores it in "len", advancing "str" past the prefix.
//
// The first unit is read into "len". If that unit's most significant bit
// (bit chrsz*8-1) is set, the remaining bits of the first unit become the
// high part (shifted left by chrsz*8) and the following unit is added as the
// low part, so the prefix occupies two units instead of one.
//
// NOTE: this expands to several statements that are not wrapped in a
// do { } while (0), and "str" and "len" are evaluated more than once, so it
// must be used as a standalone statement with side-effect-free arguments.
#define DECODE_LENGTH(str, chrsz, len) \
len = *(str); \
if (*(str)&(1<<(chrsz*8-1))) { \
(str)++; \
len = (((len)&((1<<(chrsz*8-1))-1))<<(chrsz*8)) + *(str); \
} \
(str)++;
/**
 * Reads the length prefix that precedes a UTF-16 string and advances *str
 * past it. The prefix is one or two char16_t units long, which allows a
 * maximum length of 0x7FFFFFFF (2147483647), but if you're storing that much
 * data in a string, you're abusing them.
 *
 * When the high bit (0x8000) of the first unit is set, the prefix is two
 * units (4 bytes): the low 15 bits of the first unit form the upper half of
 * the length and the second unit forms the lower 16 bits.
 *
 * On return *str points at the first unit after the prefix.
 */
static inline size_t
decodeLength(const char16_t** str)
{
    const char16_t* cursor = *str;
    size_t len = *cursor++;
    if (len & 0x8000) {
        // Two-unit form: drop the marker bit and append the next unit.
        len = ((len & 0x7FFF) << 16) | *cursor++;
    }
    *str = cursor;
    return len;
}
const uint16_t* ResStringPool::stringAt(size_t idx, size_t* outLen) const
/**
 * Reads the length prefix that precedes a UTF-8 string and advances *str
 * past it. The prefix is one or two bytes long, which allows a maximum
 * length of 0x7FFF (32767 bytes), but you should consider storing text in
 * another way if you're using that much data in a single string.
 *
 * When the high bit (0x80) of the first byte is set, the prefix is two
 * bytes: the low 7 bits of the first byte form the upper part of the length
 * and the second byte forms the lower 8 bits.
 *
 * On return *str points at the first byte after the prefix.
 */
static inline size_t
decodeLength(const uint8_t** str)
{
    const uint8_t* cursor = *str;
    size_t len = *cursor++;
    if (len & 0x80) {
        // Two-byte form: drop the marker bit and append the next byte.
        len = ((len & 0x7F) << 8) | *cursor++;
    }
    *str = cursor;
    return len;
}
const uint16_t* ResStringPool::stringAt(size_t idx, size_t* u16len) const
{
if (mError == NO_ERROR && idx < mHeader->stringCount) {
const bool isUTF8 = (mHeader->flags&ResStringPool_header::UTF8_FLAG) != 0;
@ -461,37 +497,51 @@ const uint16_t* ResStringPool::stringAt(size_t idx, size_t* outLen) const
if (!isUTF8) {
const char16_t* strings = (char16_t*)mStrings;
const char16_t* str = strings+off;
DECODE_LENGTH(str, sizeof(char16_t), *outLen)
if ((uint32_t)(str+*outLen-strings) < mStringPoolSize) {
*u16len = decodeLength(&str);
if ((uint32_t)(str+*u16len-strings) < mStringPoolSize) {
return str;
} else {
LOGW("Bad string block: string #%d extends to %d, past end at %d\n",
(int)idx, (int)(str+*outLen-strings), (int)mStringPoolSize);
(int)idx, (int)(str+*u16len-strings), (int)mStringPoolSize);
}
} else {
const uint8_t* strings = (uint8_t*)mStrings;
const uint8_t* str = strings+off;
DECODE_LENGTH(str, sizeof(uint8_t), *outLen)
size_t encLen;
DECODE_LENGTH(str, sizeof(uint8_t), encLen)
if ((uint32_t)(str+encLen-strings) < mStringPoolSize) {
const uint8_t* u8str = strings+off;
*u16len = decodeLength(&u8str);
size_t u8len = decodeLength(&u8str);
// encLen must be less than 0x7FFF due to encoding.
if ((uint32_t)(u8str+u8len-strings) < mStringPoolSize) {
AutoMutex lock(mDecodeLock);
if (mCache[idx] != NULL) {
return mCache[idx];
}
char16_t *u16str = (char16_t *)calloc(*outLen+1, sizeof(char16_t));
ssize_t actualLen = utf8_to_utf16_length(u8str, u8len);
if (actualLen < 0 || (size_t)actualLen != *u16len) {
LOGW("Bad string block: string #%lld decoded length is not correct "
"%lld vs %llu\n",
(long long)idx, (long long)actualLen, (long long)*u16len);
return NULL;
}
char16_t *u16str = (char16_t *)calloc(*u16len+1, sizeof(char16_t));
if (!u16str) {
LOGW("No memory when trying to allocate decode cache for string #%d\n",
(int)idx);
return NULL;
}
const unsigned char *u8src = reinterpret_cast<const unsigned char *>(str);
utf8_to_utf16(u8src, encLen, u16str, *outLen);
utf8_to_utf16(u8str, u8len, u16str);
mCache[idx] = u16str;
return u16str;
} else {
LOGW("Bad string block: string #%d extends to %d, past end at %d\n",
(int)idx, (int)(str+encLen-strings), (int)mStringPoolSize);
LOGW("Bad string block: string #%lld extends to %lld, past end at %lld\n",
(long long)idx, (long long)(u8str+u8len-strings),
(long long)mStringPoolSize);
}
}
} else {
@ -512,9 +562,8 @@ const char* ResStringPool::string8At(size_t idx, size_t* outLen) const
if (isUTF8) {
const uint8_t* strings = (uint8_t*)mStrings;
const uint8_t* str = strings+off;
DECODE_LENGTH(str, sizeof(uint8_t), *outLen)
size_t encLen;
DECODE_LENGTH(str, sizeof(uint8_t), encLen)
*outLen = decodeLength(&str);
size_t encLen = decodeLength(&str);
if ((uint32_t)(str+encLen-strings) < mStringPoolSize) {
return (const char*)str;
} else {

View File

@ -18,228 +18,17 @@
#include <utils/Debug.h>
#include <utils/Log.h>
#include <utils/Unicode.h>
#include <utils/String8.h>
#include <utils/TextOutput.h>
#include <utils/threads.h>
#include <private/utils/Static.h>
#ifdef HAVE_WINSOCK
# undef nhtol
# undef htonl
# undef nhtos
# undef htons
# ifdef HAVE_LITTLE_ENDIAN
# define ntohl(x) ( ((x) << 24) | (((x) >> 24) & 255) | (((x) << 8) & 0xff0000) | (((x) >> 8) & 0xff00) )
# define htonl(x) ntohl(x)
# define ntohs(x) ( (((x) << 8) & 0xff00) | (((x) >> 8) & 255) )
# define htons(x) ntohs(x)
# else
# define ntohl(x) (x)
# define htonl(x) (x)
# define ntohs(x) (x)
# define htons(x) (x)
# endif
#else
# include <netinet/in.h>
#endif
#include <memory.h>
#include <stdio.h>
#include <ctype.h>
// ---------------------------------------------------------------------------
// Compares two nul-terminated char16_t strings, like strcmp().
// Returns the difference between the first pair of units that differ, or 0
// when both strings end together.
int strcmp16(const char16_t *s1, const char16_t *s2)
{
    for (;; ++s1, ++s2) {
        const int diff = (int)*s1 - (int)*s2;
        if (diff != 0 || *s1 == 0) {
            return diff;
        }
    }
}
// Compares at most n units of two char16_t strings, like strncmp().
// Stops early at the first differing unit or at a nul in s1. Returns the
// difference of the units where it stopped, or 0 if none differed.
int strncmp16(const char16_t *s1, const char16_t *s2, size_t n)
{
    for (size_t i = 0; i < n; ++i) {
        const int diff = (int)s1[i] - (int)s2[i];
        if (diff != 0 || s1[i] == 0) {
            return diff;
        }
    }
    return 0;
}
// Copies the nul-terminated char16_t string src, including its terminator,
// into dst, like strcpy(). Returns dst.
char16_t *strcpy16(char16_t *dst, const char16_t *src)
{
    size_t i = 0;
    while ((dst[i] = src[i]) != 0) {
        ++i;
    }
    return dst;
}
// Returns the number of char16_t units before the terminating nul,
// like strlen().
size_t strlen16(const char16_t *s)
{
    size_t count = 0;
    while (s[count] != 0) {
        ++count;
    }
    return count;
}
// Copies at most n units of the char16_t string src into dst, stopping after
// a nul unit has been copied, and then always writes one extra terminating
// nul. Returns dst.
//
// Unlike strncpy(), the result is always nul-terminated, so dst must have
// room for n + 1 units.
char16_t *strncpy16(char16_t *dst, const char16_t *src, size_t n)
{
    char16_t *q = dst;
    const char16_t *p = src;
    // Must be a full 16-bit unit: a plain "char" here truncated every copied
    // unit to 8 bits and mistook units such as 0x0100 for the terminator.
    char16_t ch;
    while (n) {
        n--;
        *q++ = ch = *p++;
        if (!ch) {
            break;
        }
    }
    *q = 0;
    return dst;
}
// Returns the number of char16_t units before the terminating nul, looking
// at no more than maxlen units, like strnlen().
size_t strnlen16(const char16_t *s, size_t maxlen)
{
    size_t count = 0;
    /* The bound is checked before s[count] is read, because the unit just
       past the maximum may not be readable. */
    while (count < maxlen && s[count] != 0) {
        ++count;
    }
    return count;
}
// Compares two char16_t strings given by explicit lengths; embedded nuls are
// ordinary units and neither string needs to be nul-terminated.
//
// Returns the difference of the first differing pair of units within the
// common prefix. If the common prefix is equal, the shorter string orders
// first: the result is negative when n1 < n2, positive when n1 > n2 and 0
// only when the lengths are equal.
//
// The original returned the value of the first extra unit, which is 0 (i.e.
// "equal") when that unit is an embedded nul even though the lengths differ.
// The unit's value is still returned when it is non-zero; otherwise +/-1.
int strzcmp16(const char16_t *s1, size_t n1, const char16_t *s2, size_t n2)
{
    const size_t common = n1 < n2 ? n1 : n2;
    for (size_t i = 0; i < common; ++i) {
        const int d = (int)s1[i] - (int)s2[i];
        if (d) {
            return d;
        }
    }
    if (n1 < n2) {
        const int extra = (int)s2[common];
        return extra ? (0 - extra) : -1;
    }
    if (n1 > n2) {
        const int extra = (int)s1[common];
        return extra ? extra : 1;
    }
    return 0;
}
// Version of strzcmp16 where s1H is in host byte order and s2N is in network
// byte order; each unit of s2N is passed through ntohs() before comparing.
//
// Returns the difference of the first differing pair of units within the
// common prefix. If the common prefix is equal, the shorter string orders
// first: negative when n1 < n2, positive when n1 > n2, 0 only when the
// lengths are equal.
//
// The original returned the value of the first extra unit, which is 0 (i.e.
// "equal") when that unit is an embedded nul even though the lengths differ.
// The unit's value is still returned when it is non-zero; otherwise +/-1.
int strzcmp16_h_n(const char16_t *s1H, size_t n1, const char16_t *s2N, size_t n2)
{
    const size_t common = n1 < n2 ? n1 : n2;
    for (size_t i = 0; i < common; ++i) {
        const char16_t c2 = ntohs(s2N[i]);
        const int d = (int)s1H[i] - (int)c2;
        if (d) {
            return d;
        }
    }
    if (n1 < n2) {
        const int extra = (int)ntohs(s2N[common]);
        return extra ? (0 - extra) : -1;
    }
    if (n1 > n2) {
        const int extra = (int)s1H[common];
        return extra ? extra : 1;
    }
    return 0;
}
// Returns the length in bytes (1-4) of the UTF-8 sequence introduced by the
// lead byte "ch", judged from its top four bits only:
//   0x00-0xBF -> 1, 0xC0-0xDF -> 2, 0xE0-0xEF -> 3, 0xF0-0xFF -> 4.
// Continuation bytes (0x80-0xBF) and invalid leads are not rejected.
static inline size_t
utf8_char_len(uint8_t ch)
{
    if (ch < 0xC0) {
        return 1;
    }
    if (ch < 0xE0) {
        return 2;
    }
    if (ch < 0xF0) {
        return 3;
    }
    return 4;
}
#define UTF8_SHIFT_AND_MASK(unicode, byte) (unicode)<<=6; (unicode) |= (0x3f & (byte));
// Decodes one UTF-8 sequence of "length" bytes starting at "src" into a
// code point. The lead byte contributes its payload bits (8, 5, 4 or 3 bits
// for lengths 1-4) and every following byte contributes its low 6 bits.
// Returns 0xffff for any length outside 1-4. The bytes are not validated.
static inline uint32_t
utf8_to_utf32(const uint8_t *src, size_t length)
{
    // Payload mask of the lead byte, indexed by sequence length.
    static const uint8_t kLeadPayloadMask[] = { 0x00, 0xff, 0x1f, 0x0f, 0x07 };
    if (length < 1 || length > 4) {
        return 0xffff;
    }
    uint32_t unicode = src[0] & kLeadPayloadMask[length];
    for (size_t i = 1; i < length; ++i) {
        UTF8_SHIFT_AND_MASK(unicode, src[i])
    }
    return unicode;
}
// Converts the UTF-8 bytes [src, src+srcLen) to UTF-16, writing at most
// dstLen units to dst. Code points above 0xFFFF become a surrogate pair.
// A terminating nul is written only if there is room left for it.
//
// Conversion stops early, without writing a partial character, when
//  - the last UTF-8 sequence is truncated (its length as indicated by the
//    lead byte would run past the end of src), or
//  - a surrogate pair is needed but fewer than two units remain in dst.
// The original checked neither and could read past src or write one unit
// past dst in those cases.
void
utf8_to_utf16(const uint8_t *src, size_t srcLen,
char16_t* dst, const size_t dstLen)
{
    const uint8_t* const end = src + srcLen;
    const char16_t* const dstEnd = dst + dstLen;
    while (src < end && dst < dstEnd) {
        const size_t len = utf8_char_len(*src);
        if (len > (size_t)(end - src)) {
            // Truncated sequence: decoding it would read beyond the input.
            break;
        }
        uint32_t codepoint = utf8_to_utf32((const uint8_t*)src, len);
        // Convert the UTF32 codepoint to one or more UTF16 codepoints
        if (codepoint <= 0xFFFF) {
            // Single UTF16 character
            *dst++ = (char16_t) codepoint;
        } else {
            // Multiple UTF16 characters with surrogates
            if (dstEnd - dst < 2) {
                // Not enough room for both halves of the pair.
                break;
            }
            codepoint = codepoint - 0x10000;
            *dst++ = (char16_t) ((codepoint >> 10) + 0xD800);
            *dst++ = (char16_t) ((codepoint & 0x3FF) + 0xDC00);
        }
        src += len;
    }
    if (dst < dstEnd) {
        *dst = 0;
    }
}
// ---------------------------------------------------------------------------
namespace android {
@ -270,37 +59,33 @@ void terminate_string16()
// ---------------------------------------------------------------------------
static char16_t* allocFromUTF8(const char* in, size_t len)
static char16_t* allocFromUTF8(const char* u8str, size_t u8len)
{
if (len == 0) return getEmptyString();
size_t chars = 0;
const char* end = in+len;
const char* p = in;
while (p < end) {
chars++;
int utf8len = utf8_char_len(*p);
uint32_t codepoint = utf8_to_utf32((const uint8_t*)p, utf8len);
if (codepoint > 0xFFFF) chars++; // this will be a surrogate pair in utf16
p += utf8len;
if (u8len == 0) return getEmptyString();
const uint8_t* u8cur = (const uint8_t*) u8str;
const ssize_t u16len = utf8_to_utf16_length(u8cur, u8len);
if (u16len < 0) {
return getEmptyString();
}
size_t bufSize = (chars+1)*sizeof(char16_t);
SharedBuffer* buf = SharedBuffer::alloc(bufSize);
const uint8_t* const u8end = u8cur + u8len;
SharedBuffer* buf = SharedBuffer::alloc(sizeof(char16_t)*(u16len+1));
if (buf) {
p = in;
char16_t* str = (char16_t*)buf->data();
utf8_to_utf16((const uint8_t*)p, len, str, bufSize);
u8cur = (const uint8_t*) u8str;
char16_t* u16str = (char16_t*)buf->data();
utf8_to_utf16(u8cur, u8len, u16str);
//printf("Created UTF-16 string from UTF-8 \"%s\":", in);
//printHexData(1, str, buf->size(), 16, 1);
//printf("\n");
return str;
return u16str;
}
return getEmptyString();
}

View File

@ -17,6 +17,8 @@
#include <utils/String8.h>
#include <utils/Log.h>
#include <utils/Unicode.h>
#include <utils/SharedBuffer.h>
#include <utils/String16.h>
#include <utils/TextOutput.h>
#include <utils/threads.h>
@ -34,94 +36,10 @@
namespace android {
static const char32_t kByteMask = 0x000000BF;
static const char32_t kByteMark = 0x00000080;
// Surrogates aren't valid for UTF-32 characters, so define some
// constants that will let us screen them out.
static const char32_t kUnicodeSurrogateHighStart = 0x0000D800;
static const char32_t kUnicodeSurrogateHighEnd = 0x0000DBFF;
static const char32_t kUnicodeSurrogateLowStart = 0x0000DC00;
static const char32_t kUnicodeSurrogateLowEnd = 0x0000DFFF;
static const char32_t kUnicodeSurrogateStart = kUnicodeSurrogateHighStart;
static const char32_t kUnicodeSurrogateEnd = kUnicodeSurrogateLowEnd;
static const char32_t kUnicodeMaxCodepoint = 0x0010FFFF;
// Mask used to set appropriate bits in first byte of UTF-8 sequence,
// indexed by number of bytes in the sequence.
// 0xxxxxxx
// -> (00-7f) 7bit. Bit mask for the first byte is 0x00000000
// 110yyyyx 10xxxxxx
// -> (c0-df)(80-bf) 11bit. Bit mask is 0x000000C0
// 1110yyyy 10yxxxxx 10xxxxxx
// -> (e0-ef)(80-bf)(80-bf) 16bit. Bit mask is 0x000000E0
// 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
// -> (f0-f7)(80-bf)(80-bf)(80-bf) 21bit. Bit mask is 0x000000F0
static const char32_t kFirstByteMark[] = {
0x00000000, 0x00000000, 0x000000C0, 0x000000E0, 0x000000F0
};
// Separator used by resource paths. This is not platform dependent contrary
// to OS_PATH_SEPARATOR.
#define RES_PATH_SEPARATOR '/'
// Returns how many UTF-8 bytes are needed to encode srcChar (1-4), or 0 when
// srcChar cannot be encoded: a surrogate (kUnicodeSurrogateStart through
// kUnicodeSurrogateEnd) or a value above kUnicodeMaxCodepoint.
static size_t utf32_to_utf8_bytes(char32_t srcChar)
{
    if (srcChar < 0x00000080) {
        return 1;
    }
    if (srcChar < 0x00000800) {
        return 2;
    }
    if (srcChar < 0x00010000) {
        // Surrogates are invalid UTF-32 characters.
        const bool isSurrogate = (srcChar >= kUnicodeSurrogateStart)
                && (srcChar <= kUnicodeSurrogateEnd);
        return isSurrogate ? 0 : 3;
    }
    // Max code point for Unicode is 0x0010FFFF; anything above is invalid.
    return (srcChar <= kUnicodeMaxCodepoint) ? 4 : 0;
}
// Writes srcChar to <dstP> as a UTF-8 sequence of exactly "bytes" bytes.
// Trailing bytes are filled from last to first with the low bits of srcChar
// combined with kByteMark and kByteMask; the lead byte is the remaining bits
// combined with kFirstByteMark[bytes]. Nothing is written unless "bytes" is
// between 1 and 4.
static void utf32_to_utf8(uint8_t* dstP, char32_t srcChar, size_t bytes)
{
    if (bytes < 1 || bytes > 4) {
        return;
    }
    for (size_t i = bytes - 1; i > 0; --i) {
        dstP[i] = (uint8_t)((srcChar | kByteMark) & kByteMask);
        srcChar >>= 6;
    }
    dstP[0] = (uint8_t)(srcChar | kFirstByteMark[bytes]);
}
// ---------------------------------------------------------------------------
static SharedBuffer* gEmptyStringBuf = NULL;
static char* gEmptyString = NULL;
@ -175,62 +93,47 @@ static char* allocFromUTF8(const char* in, size_t len)
return getEmptyString();
}
// Allocates a SharedBuffer holding the UTF-8 encoding of the "len" units at
// "in", each unit being encoded on its own via utf32_to_utf8_bytes() and
// utf32_to_utf8(), followed by a terminating nul. Returns the buffer's data
// pointer, or getEmptyString() when len is 0 or the allocation fails.
template<typename T, typename L>
static char* allocFromUTF16OrUTF32(const T* in, L len)
{
    if (len == 0) return getEmptyString();
    const T* const end = in+len;
    // First pass: measure the encoded size.
    size_t bytes = 0;
    for (const T* it = in; it < end; ++it) {
        bytes += utf32_to_utf8_bytes(*it);
    }
    SharedBuffer* buf = SharedBuffer::alloc(bytes+1);
    LOG_ASSERT(buf, "Unable to allocate shared buffer");
    if (!buf) {
        return getEmptyString();
    }
    // Second pass: encode each unit in place.
    char* const str = (char*)buf->data();
    char* out = str;
    for (const T* it = in; it < end; ++it) {
        const T c = *it;
        const size_t encoded = utf32_to_utf8_bytes(c);
        utf32_to_utf8((uint8_t*)out, c, encoded);
        out += encoded;
    }
    *out = 0;
    return str;
}
static char* allocFromUTF16(const char16_t* in, size_t len)
{
if (len == 0) return getEmptyString();
const size_t bytes = utf8_length_from_utf16(in, len);
const ssize_t bytes = utf16_to_utf8_length(in, len);
if (bytes < 0) {
return getEmptyString();
}
SharedBuffer* buf = SharedBuffer::alloc(bytes+1);
LOG_ASSERT(buf, "Unable to allocate shared buffer");
if (buf) {
char* str = (char*)buf->data();
utf16_to_utf8(in, len, str, bytes+1);
return str;
if (!buf) {
return getEmptyString();
}
return getEmptyString();
char* str = (char*)buf->data();
utf16_to_utf8(in, len, str);
return str;
}
static char* allocFromUTF32(const char32_t* in, size_t len)
{
return allocFromUTF16OrUTF32<char32_t, size_t>(in, len);
if (len == 0) {
return getEmptyString();
}
const ssize_t bytes = utf32_to_utf8_length(in, len);
if (bytes < 0) {
return getEmptyString();
}
SharedBuffer* buf = SharedBuffer::alloc(bytes+1);
LOG_ASSERT(buf, "Unable to allocate shared buffer");
if (!buf) {
return getEmptyString();
}
char* str = (char*) buf->data();
utf32_to_utf8(in, len, str);
return str;
}
// ---------------------------------------------------------------------------
@ -510,17 +413,17 @@ void String8::toUpper(size_t start, size_t length)
size_t String8::getUtf32Length() const
{
return utf32_length(mString, length());
return utf8_to_utf32_length(mString, length());
}
int32_t String8::getUtf32At(size_t index, size_t *next_index) const
{
return utf32_at(mString, length(), index, next_index);
return utf32_from_utf8_at(mString, length(), index, next_index);
}
size_t String8::getUtf32(char32_t* dst, size_t dst_len) const
void String8::getUtf32(char32_t* dst) const
{
return utf8_to_utf32(mString, length(), dst, dst_len);
utf8_to_utf32(mString, length(), dst);
}
TextOutput& operator<<(TextOutput& to, const String8& val)
@ -705,241 +608,3 @@ String8& String8::convertToResPath()
}
}; // namespace android
// ---------------------------------------------------------------------------
// Returns the number of char32_t units before the terminating nul,
// like strlen().
size_t strlen32(const char32_t *s)
{
    size_t count = 0;
    while (s[count] != 0) {
        ++count;
    }
    return count;
}
// Returns the number of char32_t units before the terminating nul, looking
// at no more than maxlen units, like strnlen(). The bound is tested before
// each unit is read.
size_t strnlen32(const char32_t *s, size_t maxlen)
{
    size_t count = 0;
    while (count < maxlen && s[count] != 0) {
        ++count;
    }
    return count;
}
// Returns the length in bytes of the nul-terminated string "src" when it is
// structurally valid UTF-8 whose code points do not exceed
// android::kUnicodeMaxCodepoint. Returns 0 if src is NULL, empty, or not
// valid by those checks.
//
// Checks performed per character: the lead byte must not be a continuation
// byte (10xxxxxx) and must announce at most 4 bytes; every following byte
// must be a continuation byte; the decoded value must be in range.
//
// Fixes over the original:
//  - NULL is handled as documented instead of being dereferenced.
//  - The lead byte is held as unsigned. With a signed plain "char" the
//    sign-extended lead byte leaked high bits into the decoded value, so
//    every multi-byte character was rejected as out of range.
size_t utf8_length(const char *src)
{
    if (src == NULL) {
        return 0;
    }
    const char *cur = src;
    size_t ret = 0;
    while (*cur != '\0') {
        const unsigned char first_char = static_cast<unsigned char>(*cur++);
        if ((first_char & 0x80) == 0) { // ASCII
            ret += 1;
            continue;
        }
        // (UTF-8's character must not be like 10xxxxxx,
        // but 110xxxxx, 1110xxxx, ... or 1111110x)
        if ((first_char & 0x40) == 0) {
            return 0;
        }
        int32_t mask, to_ignore_mask;
        size_t num_to_read = 0;
        char32_t utf32 = 0;
        // One continuation byte is consumed for every marker bit set in the
        // lead byte below 0x80; to_ignore_mask collects those marker bits.
        for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0x80;
             num_to_read < 5 && (first_char & mask);
             num_to_read++, to_ignore_mask |= mask, mask >>= 1) {
            if ((*cur & 0xC0) != 0x80) { // must be 10xxxxxx
                return 0;
            }
            // 0x3F == 00111111
            utf32 = (utf32 << 6) + (*cur++ & 0x3F);
        }
        // "first_char" must be (110xxxxx - 11110xxx)
        if (num_to_read == 5) {
            return 0;
        }
        // Add the lead byte's payload bits above the continuation bits.
        to_ignore_mask |= mask;
        utf32 |= ((~to_ignore_mask) & first_char) << (6 * (num_to_read - 1));
        if (utf32 > android::kUnicodeMaxCodepoint) {
            return 0;
        }
        ret += num_to_read;
    }
    return ret;
}
// Returns the number of characters in the UTF-8 bytes [src, src+src_len),
// i.e. the length the string would have in UTF-32. Returns 0 when src is
// NULL or src_len is 0.
//
// Each character's width comes from its lead byte alone: 1 for ASCII,
// otherwise 1 plus the number of consecutive set bits below 0x80. The input
// is not validated.
size_t utf32_length(const char *src, size_t src_len)
{
    if (src == NULL || src_len == 0) {
        return 0;
    }
    const char* const end = src + src_len;
    size_t count = 0;
    const char* cur = src;
    while (cur < end) {
        const char lead = *cur;
        size_t width = 1;
        if ((lead & 0x80) != 0) {
            for (int32_t bit = 0x40; (lead & bit) != 0; bit >>= 1) {
                ++width;
            }
        }
        cur += width;
        ++count;
    }
    return count;
}
// Returns the number of UTF-8 bytes needed to encode the src_len UTF-32
// units at "src", summing android::utf32_to_utf8_bytes() over each unit.
// Returns 0 when src is NULL or src_len is 0.
size_t utf8_length_from_utf32(const char32_t *src, size_t src_len)
{
    if (src == NULL || src_len == 0) {
        return 0;
    }
    size_t total = 0;
    for (size_t i = 0; i < src_len; ++i) {
        total += android::utf32_to_utf8_bytes(src[i]);
    }
    return total;
}
// Returns the number of UTF-8 bytes needed to encode the src_len UTF-16
// units at "src". Returns 0 when src is NULL or src_len is 0.
//
// A high surrogate (0xD800-0xDBFF) immediately followed by a low surrogate
// (0xDC00-0xDFFF) counts as 4 bytes; every other unit is sized with
// android::utf32_to_utf8_bytes().
size_t utf8_length_from_utf16(const char16_t *src, size_t src_len)
{
if (src == NULL || src_len == 0) {
return 0;
}
size_t ret = 0;
const char16_t* const end = src + src_len;
while (src < end) {
// NOTE: the "*++src" in the last operand advances src as soon as a high
// surrogate with a following unit is seen, whether or not that unit turns
// out to be a low surrogate. When it is not, the else branch therefore
// sizes the FOLLOWING unit and the unpaired high surrogate adds nothing.
if ((*src & 0xFC00) == 0xD800 && (src + 1) < end
&& (*++src & 0xFC00) == 0xDC00) {
// surrogate pairs are always 4 bytes.
ret += 4;
src++;
} else {
ret += android::utf32_to_utf8_bytes((char32_t) *src++);
}
}
return ret;
}
// Decodes the single UTF-8 character that starts at "cur" and returns its
// code point; the number of bytes consumed is stored in *num_read.
//
// The sequence length is taken from the lead byte only. Continuation bytes
// are neither validated nor bounds-checked, so the caller must ensure the
// whole sequence is readable.
static int32_t utf32_at_internal(const char* cur, size_t *num_read)
{
const char first_char = *cur;
if ((first_char & 0x80) == 0) { // ASCII
*num_read = 1;
return *cur;
}
cur++;
char32_t mask, to_ignore_mask;
size_t num_to_read = 0;
// Start from the raw lead byte; its marker bits are stripped after the loop.
char32_t utf32 = first_char;
// For each consecutive set bit below 0x80 in the lead byte, append the low
// 6 bits of one following byte. to_ignore_mask accumulates the lead byte's
// marker bits; it starts at 0xFFFFFF80 so that it also covers all bits above
// the lead byte (presumably to cancel sign extension where "char" is signed).
for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0xFFFFFF80;
(first_char & mask);
num_to_read++, to_ignore_mask |= mask, mask >>= 1) {
// 0x3F == 00111111
utf32 = (utf32 << 6) + (*cur++ & 0x3F);
}
// Include the first clear bit after the markers, then clear all of those
// bits at the position the lead byte was shifted to.
to_ignore_mask |= mask;
utf32 &= ~(to_ignore_mask << (6 * (num_to_read - 1)));
*num_read = num_to_read;
return static_cast<int32_t>(utf32);
}
// Returns the code point of the UTF-8 character starting at byte offset
// "index" of "src", or -1 when index is not less than src_len.
// When the returned value is non-negative and next_index is not NULL,
// *next_index receives the byte offset just past that character.
int32_t utf32_at(const char *src, size_t src_len,
size_t index, size_t *next_index)
{
    if (index >= src_len) {
        return -1;
    }
    size_t consumed;
    const int32_t value = utf32_at_internal(src + index, &consumed);
    if (value >= 0 && next_index != NULL) {
        *next_index = index + consumed;
    }
    return value;
}
// Converts the UTF-8 bytes [src, src+src_len) to UTF-32, storing at most
// dst_len units in dst. Returns the number of units stored. A terminating 0
// is added only when dst still has room for it (like strncpy). Returns 0
// without touching dst when either pointer is NULL or either length is 0.
size_t utf8_to_utf32(const char* src, size_t src_len,
char32_t* dst, size_t dst_len)
{
    if (src == NULL || src_len == 0 || dst == NULL || dst_len == 0) {
        return 0;
    }
    const char* const src_end = src + src_len;
    const char32_t* const dst_end = dst + dst_len;
    char32_t* out = dst;
    for (const char* in = src; out < dst_end && in < src_end; ) {
        size_t consumed;
        *out++ = static_cast<char32_t>(utf32_at_internal(in, &consumed));
        in += consumed;
    }
    if (out < dst_end) {
        *out = 0;
    }
    return static_cast<size_t>(out - dst);
}
// Converts the src_len UTF-32 units at "src" to UTF-8, storing at most
// dst_len bytes in dst. Returns the number of bytes stored. A terminating
// '\0' is added only when dst still has room for it (like strncpy).
// Returns 0 without touching dst when either pointer is NULL or either
// length is 0.
//
// A character is stored only if all of its bytes fit; conversion stops at
// the first one that does not. The original only checked that at least one
// byte remained, so a multi-byte character could be written past the end of
// dst. Units for which android::utf32_to_utf8_bytes() returns 0 (surrogates,
// out-of-range values) are skipped, as before.
size_t utf32_to_utf8(const char32_t* src, size_t src_len,
char* dst, size_t dst_len)
{
    if (src == NULL || src_len == 0 || dst == NULL || dst_len == 0) {
        return 0;
    }
    const char32_t *cur_utf32 = src;
    const char32_t *end_utf32 = src + src_len;
    char *cur = dst;
    const char *end = dst + dst_len;
    while (cur_utf32 < end_utf32 && cur < end) {
        const size_t len = android::utf32_to_utf8_bytes(*cur_utf32);
        if (len > static_cast<size_t>(end - cur)) {
            // Not enough room for the whole character; store nothing of it.
            break;
        }
        android::utf32_to_utf8((uint8_t *)cur, *cur_utf32++, len);
        cur += len;
    }
    if (cur < end) {
        *cur = '\0';
    }
    return cur - dst;
}
// Converts the src_len UTF-16 units at "src" to UTF-8, storing at most
// dst_len bytes in dst. Returns the number of bytes stored. A terminating
// '\0' is added only when dst still has room for it (like strncpy).
// Returns 0 without touching dst when either pointer is NULL or either
// length is 0.
//
// Fixes over the original:
//  - A high surrogate is combined with the next unit only when that unit is
//    a low surrogate (0xDC00-0xDFFF). Previously any following unit was
//    merged in, producing an unrelated code point and consuming that unit.
//    An unpaired surrogate is now passed on its own to
//    android::utf32_to_utf8_bytes(), which returns 0 for it, so it is
//    skipped.
//  - A character is stored only if all of its bytes fit in dst; previously a
//    multi-byte character could be written past the end of dst.
size_t utf16_to_utf8(const char16_t* src, size_t src_len,
char* dst, size_t dst_len)
{
    if (src == NULL || src_len == 0 || dst == NULL || dst_len == 0) {
        return 0;
    }
    const char16_t* cur_utf16 = src;
    const char16_t* const end_utf16 = src + src_len;
    char *cur = dst;
    const char* const end = dst + dst_len;
    while (cur_utf16 < end_utf16 && cur < end) {
        char32_t utf32;
        size_t consumed;
        // surrogate pairs
        if ((cur_utf16[0] & 0xFC00) == 0xD800 && (cur_utf16 + 1) < end_utf16
                && (cur_utf16[1] & 0xFC00) == 0xDC00) {
            utf32 = (char32_t)(cur_utf16[0] - 0xD800) << 10;
            utf32 |= (char32_t)(cur_utf16[1] - 0xDC00);
            utf32 += 0x10000;
            consumed = 2;
        } else {
            utf32 = (char32_t) cur_utf16[0];
            consumed = 1;
        }
        const size_t len = android::utf32_to_utf8_bytes(utf32);
        if (len > static_cast<size_t>(end - cur)) {
            // Not enough room for the whole character; store nothing of it.
            break;
        }
        android::utf32_to_utf8((uint8_t*)cur, utf32, len);
        cur += len;
        cur_utf16 += consumed;
    }
    if (cur < end) {
        *cur = '\0';
    }
    return cur - dst;
}

575
libs/utils/Unicode.cpp Normal file
View File

@ -0,0 +1,575 @@
/*
* Copyright (C) 2005 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <utils/Unicode.h>
#include <stddef.h>
// Byte-order helpers. When HAVE_WINSOCK is defined, any existing
// ntohl/htonl/ntohs/htons macros are dropped and replaced with local
// definitions chosen by HAVE_LITTLE_ENDIAN; otherwise the versions from
// <netinet/in.h> are used.
#ifdef HAVE_WINSOCK
# undef ntohl
# undef htonl
# undef ntohs
# undef htons
# ifdef HAVE_LITTLE_ENDIAN
// Little-endian host: network order is the byte-swapped value.
# define ntohl(x) ( ((x) << 24) | (((x) >> 24) & 255) | (((x) << 8) & 0xff0000) | (((x) >> 8) & 0xff00) )
# define htonl(x) ntohl(x)
# define ntohs(x) ( (((x) << 8) & 0xff00) | (((x) >> 8) & 255) )
# define htons(x) ntohs(x)
# else
// Otherwise the conversions are identity operations.
# define ntohl(x) (x)
# define htonl(x) (x)
# define ntohs(x) (x)
# define htons(x) (x)
# endif
#else
# include <netinet/in.h>
#endif
extern "C" {
// kByteMark and kByteMask are applied together when emitting a UTF-8
// continuation byte: (value | kByteMark) & kByteMask forces bit 7 on and
// bit 6 off, leaving a byte of the form 10xxxxxx.
static const char32_t kByteMask = 0x000000BF;
static const char32_t kByteMark = 0x00000080;
// Surrogates aren't valid for UTF-32 characters, so define some
// constants that will let us screen them out.
static const char32_t kUnicodeSurrogateHighStart = 0x0000D800;
static const char32_t kUnicodeSurrogateHighEnd = 0x0000DBFF;
static const char32_t kUnicodeSurrogateLowStart = 0x0000DC00;
static const char32_t kUnicodeSurrogateLowEnd = 0x0000DFFF;
// Whole surrogate range, high and low halves combined.
static const char32_t kUnicodeSurrogateStart = kUnicodeSurrogateHighStart;
static const char32_t kUnicodeSurrogateEnd = kUnicodeSurrogateLowEnd;
// Largest code point accepted by the conversion routines.
static const char32_t kUnicodeMaxCodepoint = 0x0010FFFF;
// Mask used to set appropriate bits in first byte of UTF-8 sequence,
// indexed by number of bytes in the sequence.
// 0xxxxxxx
// -> (00-7f) 7bit. Bit mask for the first byte is 0x00000000
// 110yyyyx 10xxxxxx
// -> (c0-df)(80-bf) 11bit. Bit mask is 0x000000C0
// 1110yyyy 10yxxxxx 10xxxxxx
// -> (e0-ef)(80-bf)(80-bf) 16bit. Bit mask is 0x000000E0
// 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
// -> (f0-f7)(80-bf)(80-bf)(80-bf) 21bit. Bit mask is 0x000000F0
// Index 0 is a placeholder so the table can be indexed directly by length.
static const char32_t kFirstByteMark[] = {
0x00000000, 0x00000000, 0x000000C0, 0x000000E0, 0x000000F0
};
// --------------------------------------------------------------------------
// UTF-32
// --------------------------------------------------------------------------
/**
 * Number of UTF-8 bytes needed to encode "srcChar".
 *
 * Returns 0 for values that cannot be encoded: anything inside the surrogate
 * range and anything above kUnicodeMaxCodepoint.
 */
static inline size_t utf32_codepoint_utf8_length(char32_t srcChar)
{
    if (srcChar < 0x00000080) {
        return 1;
    }
    if (srcChar < 0x00000800) {
        return 2;
    }
    if (srcChar < 0x00010000) {
        // Surrogates are invalid UTF-32 characters.
        const bool isSurrogate = (srcChar >= kUnicodeSurrogateStart)
                && (srcChar <= kUnicodeSurrogateEnd);
        return isSurrogate ? 0 : 3;
    }
    // Max code point for Unicode is 0x0010FFFF; anything larger is invalid.
    if (srcChar <= kUnicodeMaxCodepoint) {
        return 4;
    }
    return 0;
}
/**
 * Writes "srcChar" to "dstP" as a UTF-8 sequence of exactly "bytes" bytes.
 * Nothing is written when "bytes" is outside 1..4.
 */
static inline void utf32_codepoint_to_utf8(uint8_t* dstP, char32_t srcChar, size_t bytes)
{
    if (bytes < 1 || bytes > 4) {
        return;
    }
    // Fill the continuation bytes from the last one backwards, peeling six
    // payload bits off the code point each time.
    for (size_t pos = bytes - 1; pos > 0; pos--) {
        dstP[pos] = (uint8_t)((srcChar | kByteMark) & kByteMask);
        srcChar >>= 6;
    }
    // Whatever is left goes into the lead byte together with its length marker.
    dstP[0] = (uint8_t)(srcChar | kFirstByteMark[bytes]);
}
/**
 * Returns the number of UTF-32 units in "s" before the terminating 0.
 */
size_t strlen32(const char32_t *s)
{
    size_t len = 0;
    while (s[len] != 0) {
        len++;
    }
    return len;
}
/**
 * Returns the number of UTF-32 units in "s" before the terminating 0, looking
 * at no more than "maxlen" units.
 */
size_t strnlen32(const char32_t *s, size_t maxlen)
{
    size_t len = 0;
    // The bound is tested first so s[maxlen] is never read.
    while (len < maxlen && s[len] != 0) {
        len++;
    }
    return len;
}
/**
 * Decodes the UTF-8 sequence starting at "cur" and stores the number of bytes
 * it occupies in *num_read.
 *
 * The sequence length is taken from the run of 1 bits that follows the top
 * bit of the first byte. Continuation bytes are neither validated nor
 * bounds-checked, so the input must hold the whole sequence.
 */
static inline int32_t utf32_at_internal(const char* cur, size_t *num_read)
{
    const char lead = *cur;
    if ((lead & 0x80) == 0) { // ASCII
        *num_read = 1;
        return *cur;
    }

    const char* next = cur + 1;
    size_t count = 1;                   // bytes in the sequence so far
    char32_t bit = 0x40;                // length-marker bit being examined
    char32_t discard = 0xFFFFFF80;      // lead-byte bits that carry no payload
    char32_t value = lead;

    while (lead & bit) {
        // 0x3F == 00111111
        value = (value << 6) + (*next & 0x3F);
        next++;
        count++;
        discard |= bit;
        bit >>= 1;
    }

    // Also drop the 0 bit that ended the marker run, then strip the marker
    // bits from where the lead byte ended up after the shifts.
    discard |= bit;
    value &= ~(discard << (6 * (count - 1)));

    *num_read = count;
    return static_cast<int32_t>(value);
}
/**
 * Decodes the UTF-8 sequence that starts at byte offset "index" of "src".
 *
 * Returns -1 when "index" is not below "src_len"; otherwise returns whatever
 * utf32_at_internal() produced. When that value is non-negative and
 * "next_index" is not NULL, *next_index receives the offset just past the
 * bytes that were consumed.
 */
int32_t utf32_from_utf8_at(const char *src, size_t src_len, size_t index, size_t *next_index)
{
    if (index >= src_len) {
        return -1;
    }

    size_t consumed;
    const int32_t codepoint = utf32_at_internal(src + index, &consumed);
    if (codepoint >= 0 && next_index != NULL) {
        *next_index = index + consumed;
    }
    return codepoint;
}
/**
 * Returns the number of UTF-8 bytes needed to encode the first "src_len"
 * units of "src", or -1 when "src" is NULL or "src_len" is 0. Units that
 * utf32_codepoint_utf8_length() reports as 0 bytes add nothing to the total.
 */
ssize_t utf32_to_utf8_length(const char32_t *src, size_t src_len)
{
    if (src == NULL || src_len == 0) {
        return -1;
    }

    size_t total = 0;
    for (size_t i = 0; i < src_len; i++) {
        total += utf32_codepoint_utf8_length(src[i]);
    }
    return total;
}
/**
 * Encodes the first "src_len" UTF-32 units of "src" as UTF-8 into "dst" and
 * appends a terminating '\0'. No output bound is taken or checked, so "dst"
 * must be large enough for the encoded bytes plus the terminator.
 * Does nothing when "src" or "dst" is NULL or "src_len" is 0.
 */
void utf32_to_utf8(const char32_t* src, size_t src_len, char* dst)
{
    if (src == NULL || src_len == 0 || dst == NULL) {
        return;
    }

    char* out = dst;
    for (size_t i = 0; i < src_len; i++) {
        const char32_t codepoint = src[i];
        const size_t width = utf32_codepoint_utf8_length(codepoint);
        utf32_codepoint_to_utf8((uint8_t *)out, codepoint, width);
        out += width;
    }
    *out = '\0';
}
// --------------------------------------------------------------------------
// UTF-16
// --------------------------------------------------------------------------
/**
 * Compares two 0-terminated UTF-16 strings unit by unit.
 * Returns the difference of the first pair of units that differ, or 0 when
 * both strings end together.
 */
int strcmp16(const char16_t *s1, const char16_t *s2)
{
    for (;; s1++, s2++) {
        const int diff = (int)*s1 - (int)*s2;
        if (diff != 0 || *s1 == 0) {
            return diff;
        }
    }
}
/**
 * Compares at most "n" units of two 0-terminated UTF-16 strings.
 * Returns the difference of the first pair of units that differ, or 0 when
 * the compared prefixes are equal (including when "n" is 0).
 */
int strncmp16(const char16_t *s1, const char16_t *s2, size_t n)
{
    for (size_t i = 0; i < n; i++) {
        const int diff = (int)s1[i] - (int)s2[i];
        if (diff != 0 || s1[i] == 0) {
            return diff;
        }
    }
    return 0;
}
/**
 * Copies the 0-terminated UTF-16 string "src", terminator included, into
 * "dst" and returns "dst".
 */
char16_t *strcpy16(char16_t *dst, const char16_t *src)
{
    size_t i = 0;
    // The assignment copies the unit; the loop ends once a 0 has been copied.
    while ((dst[i] = src[i]) != 0) {
        i++;
    }
    return dst;
}
/**
 * Returns the number of UTF-16 units in "s" before the terminating 0.
 */
size_t strlen16(const char16_t *s)
{
    size_t len = 0;
    while (s[len] != 0) {
        len++;
    }
    return len;
}
/**
 * Copies at most "n" UTF-16 units from "src" to "dst", stopping after a
 * terminating 0 has been copied, and returns "dst".
 *
 * Unlike strncpy(), the result is always 0-terminated: when "n" units are
 * copied without meeting a terminator, a 0 is stored at dst[n]. The
 * destination therefore needs room for n + 1 units. The rest of the buffer
 * is not zero-padded.
 */
char16_t *strncpy16(char16_t *dst, const char16_t *src, size_t n)
{
    char16_t *q = dst;
    const char16_t *p = src;

    while (n > 0) {
        n--;
        // Keep the full 16-bit unit: holding it in a plain char would both
        // corrupt the stored value and end the copy early on any unit whose
        // low byte is zero (e.g. U+0100).
        const char16_t ch = *p++;
        *q++ = ch;
        if (ch == 0) {
            // Terminator already stored; nothing more to write.
            return dst;
        }
    }

    // Ran out of budget before reaching the end of src: terminate explicitly.
    *q = 0;
    return dst;
}
/**
 * Returns the number of UTF-16 units in "s" before the terminating 0, looking
 * at no more than "maxlen" units.
 */
size_t strnlen16(const char16_t *s, size_t maxlen)
{
    size_t len = 0;
    /* Important: the bound must be tested before s[len] is read, since the
       unit beyond the maximum may not be accessible. */
    while (len < maxlen && s[len] != 0) {
        len++;
    }
    return len;
}
/**
 * Compares two UTF-16 buffers with explicit lengths; a 0 unit does not end
 * the comparison.
 *
 * Returns the difference of the first differing pair within the common
 * prefix. If the prefixes match and the lengths differ, the result is the
 * first extra unit of the longer buffer (negated when "s2" is the longer
 * one) -- which is 0 when that unit happens to be 0. Returns 0 when the
 * lengths and contents are equal.
 */
int strzcmp16(const char16_t *s1, size_t n1, const char16_t *s2, size_t n2)
{
    const size_t common = (n1 < n2) ? n1 : n2;
    for (size_t i = 0; i < common; i++) {
        const int diff = (int)s1[i] - (int)s2[i];
        if (diff != 0) {
            return diff;
        }
    }

    if (n1 < n2) {
        return 0 - (int)s2[common];
    }
    if (n1 > n2) {
        return (int)s1[common];
    }
    return 0;
}
/**
 * Same comparison as strzcmp16(), except that the units of "s2N" are passed
 * through ntohs() before being compared with the units of "s1H".
 *
 * Returns the difference of the first differing pair within the common
 * prefix; otherwise the first extra unit of the longer buffer (negated when
 * "s2N" is the longer one), or 0 when lengths and contents are equal.
 */
int strzcmp16_h_n(const char16_t *s1H, size_t n1, const char16_t *s2N, size_t n2)
{
    const size_t common = (n1 < n2) ? n1 : n2;
    for (size_t i = 0; i < common; i++) {
        const char16_t converted = ntohs(s2N[i]);
        const int diff = (int)s1H[i] - (int)converted;
        if (diff != 0) {
            return diff;
        }
    }

    if (n1 < n2) {
        return 0 - (int)ntohs(s2N[common]);
    }
    if (n1 > n2) {
        return (int)s1H[common];
    }
    return 0;
}
/**
 * Encodes the first "src_len" UTF-16 units of "src" as UTF-8 into "dst" and
 * appends a terminating '\0'. No output bound is taken or checked, so "dst"
 * must be large enough for the encoded bytes plus the terminator.
 *
 * A high surrogate is combined with the following unit only when that unit
 * lies inside the input and is a low surrogate. Any other unit is encoded on
 * its own; units for which utf32_codepoint_utf8_length() returns 0 (such as
 * unpaired surrogates) produce no output.
 *
 * Does nothing when "src" or "dst" is NULL or "src_len" is 0.
 */
void utf16_to_utf8(const char16_t* src, size_t src_len, char* dst)
{
    if (src == NULL || src_len == 0 || dst == NULL) {
        return;
    }
    const char16_t* cur_utf16 = src;
    const char16_t* const end_utf16 = src + src_len;
    char *cur = dst;
    while (cur_utf16 < end_utf16) {
        char32_t utf32;
        // surrogate pairs: the second half must exist (never read past
        // end_utf16) and must really be a low surrogate.
        if ((*cur_utf16 & 0xFC00) == 0xD800 && (cur_utf16 + 1) < end_utf16
                && (*(cur_utf16 + 1) & 0xFC00) == 0xDC00) {
            utf32 = (*cur_utf16++ - 0xD800) << 10;
            utf32 |= *cur_utf16++ - 0xDC00;
            utf32 += 0x10000;
        } else {
            utf32 = (char32_t) *cur_utf16++;
        }
        const size_t len = utf32_codepoint_utf8_length(utf32);
        utf32_codepoint_to_utf8((uint8_t*)cur, utf32, len);
        cur += len;
    }
    *cur = '\0';
}
// --------------------------------------------------------------------------
// UTF-8
// --------------------------------------------------------------------------
/**
 * Validates the 0-terminated UTF-8 string "src" and returns the number of
 * bytes it occupies (terminator not counted), or -1 if it is malformed.
 *
 * A string is rejected when a lead byte has the form 10xxxxxx, when a
 * continuation byte does not have the form 10xxxxxx, when a sequence would
 * be longer than four bytes, or when the decoded value exceeds
 * kUnicodeMaxCodepoint.
 */
ssize_t utf8_length(const char *src)
{
    const char *cur = src;
    size_t ret = 0;
    while (*cur != '\0') {
        // Read the lead byte as unsigned. With a signed plain char the value
        // would be sign-extended when combined with ~to_ignore_mask below,
        // turning every multi-byte sequence into an out-of-range code point.
        const uint8_t first_char = static_cast<uint8_t>(*cur++);
        if ((first_char & 0x80) == 0) { // ASCII
            ret += 1;
            continue;
        }
        // (UTF-8's character must not be like 10xxxxxx,
        //  but 110xxxxx, 1110xxxx, ... or 1111110x)
        if ((first_char & 0x40) == 0) {
            return -1;
        }

        int32_t mask, to_ignore_mask;
        size_t num_to_read = 0;
        char32_t utf32 = 0;
        for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0x80;
             num_to_read < 5 && (first_char & mask);
             num_to_read++, to_ignore_mask |= mask, mask >>= 1) {
            // A terminating '\0' fails this test too, so a truncated
            // sequence never reads past the end of the string.
            if ((*cur & 0xC0) != 0x80) { // must be 10xxxxxx
                return -1;
            }
            // 0x3F == 00111111
            utf32 = (utf32 << 6) + (*cur++ & 0x3F);
        }
        // "first_char" must be (110xxxxx - 11110xxx)
        if (num_to_read == 5) {
            return -1;
        }
        // Merge in the payload bits of the lead byte, above the bits
        // collected from the continuation bytes.
        to_ignore_mask |= mask;
        utf32 |= ((~to_ignore_mask) & first_char) << (6 * (num_to_read - 1));
        if (utf32 > kUnicodeMaxCodepoint) {
            return -1;
        }

        ret += num_to_read;
    }
    return ret;
}
/**
 * Returns the number of UTF-8 bytes needed to encode the first "src_len"
 * UTF-16 units of "src", or -1 when "src" is NULL or "src_len" is 0.
 *
 * A high surrogate immediately followed (inside the input) by a low surrogate
 * counts as one 4-byte sequence. Every other unit is measured on its own with
 * utf32_codepoint_utf8_length(), which reports 0 for unpaired surrogates.
 */
ssize_t utf16_to_utf8_length(const char16_t *src, size_t src_len)
{
    if (src == NULL || src_len == 0) {
        return -1;
    }

    size_t ret = 0;
    const char16_t* const end = src + src_len;
    while (src < end) {
        // Peek at the next unit without advancing: moving the cursor inside
        // the condition would skip a unit whenever the pair test fails, and
        // that unit could itself be the start of a valid pair.
        if ((*src & 0xFC00) == 0xD800 && (src + 1) < end
                && (*(src + 1) & 0xFC00) == 0xDC00) {
            // surrogate pairs are always 4 bytes.
            ret += 4;
            src += 2;
        } else {
            ret += utf32_codepoint_utf8_length((char32_t) *src++);
        }
    }
    return ret;
}
/**
 * Returns the sequence length (1-4) implied by a UTF-8 lead byte, based on
 * its top four bits:
 *
 * 1111 -> 4
 * 1110 -> 3
 * 110x -> 2
 * 10xx -> 1
 * 0xxx -> 1
 */
static inline size_t utf8_codepoint_len(uint8_t ch)
{
    // 0xe5000000 is a packed table of sixteen 2-bit entries, each holding
    // (length - 1) for one value of the top nibble.
    const uint32_t packedLengths = 0xe5000000;
    // Twice the top nibble: the bit offset of that nibble's table entry.
    const unsigned int entryShift = (ch >> 3) & 0x1e;
    return ((packedLengths >> entryShift) & 3) + 1;
}
/**
 * Appends the low six bits of a continuation byte to *codePoint: the current
 * value is shifted left by six and the new bits are OR-ed in.
 */
static inline void utf8_shift_and_mask(uint32_t* codePoint, const uint8_t byte)
{
    const uint32_t payload = 0x3F & byte;
    *codePoint = (*codePoint << 6) | payload;
}
/**
 * Counts the UTF-8 sequences in the first "src_len" bytes of "src", i.e. the
 * number of UTF-32 units a conversion would produce. Returns 0 when "src" is
 * NULL or "src_len" is 0.
 *
 * Only lead bytes are inspected: each sequence is skipped according to the
 * run of 1 bits following the top bit of its first byte, and continuation
 * bytes are not validated.
 */
size_t utf8_to_utf32_length(const char *src, size_t src_len)
{
    if (src == NULL || src_len == 0) {
        return 0;
    }

    size_t count = 0;
    const char* const end = src + src_len;
    const char* cur = src;
    while (cur < end) {
        const char lead = *cur;
        size_t step = 1;
        if ((lead & 0x80) != 0) {
            // Not ASCII: one extra byte per marker bit set below the top bit.
            for (int32_t bit = 0x40; (lead & bit) != 0; bit >>= 1) {
                step++;
            }
        }
        cur += step;
        count++;
    }
    return count;
}
/**
 * Decodes the first "src_len" bytes of the UTF-8 string "src" into UTF-32
 * units in "dst" and appends a terminating 0. No output bound is taken or
 * checked, so "dst" must be large enough for the result plus the terminator.
 * Does nothing when "src" or "dst" is NULL or "src_len" is 0.
 */
void utf8_to_utf32(const char* src, size_t src_len, char32_t* dst)
{
    if (src == NULL || src_len == 0 || dst == NULL) {
        return;
    }

    char32_t* out = dst;
    size_t offset = 0;   // bytes of src consumed so far
    while (offset < src_len) {
        size_t consumed;
        *out++ = static_cast<char32_t>(utf32_at_internal(src + offset, &consumed));
        offset += consumed;
    }
    *out = 0;
}
/**
 * Decodes one UTF-8 sequence of "length" bytes starting at "src".
 * A length of 1 returns the byte unchanged; lengths 2-4 combine the payload
 * bits of the lead byte with six bits from each following byte. Any other
 * length yields 0xffff.
 */
static inline uint32_t utf8_to_utf32_codepoint(const uint8_t *src, size_t length)
{
    if (length < 1 || length > 4) {
        return 0xffff;
    }
    if (length == 1) {
        return src[0];
    }

    // Payload bits of the lead byte, indexed by sequence length.
    static const uint8_t kLeadPayloadMask[] = { 0x00, 0x00, 0x1f, 0x0f, 0x07 };

    uint32_t unicode = src[0] & kLeadPayloadMask[length];
    for (size_t i = 1; i < length; i++) {
        utf8_shift_and_mask(&unicode, src[i]);
    }
    return unicode;
}
/**
 * Returns the number of UTF-16 units needed to hold the UTF-8 data in
 * u8str[0 .. u8len), or -1 if the last sequence is cut off by the end of the
 * buffer.
 *
 * Sequence lengths come from utf8_codepoint_len() applied to each lead byte.
 * A decoded value above 0xFFFF counts as two units (a surrogate pair).
 */
ssize_t utf8_to_utf16_length(const uint8_t* u8str, size_t u8len)
{
    const uint8_t* const u8end = u8str + u8len;
    const uint8_t* u8cur = u8str;

    size_t u16measuredLen = 0;
    while (u8cur < u8end) {
        const size_t u8charLen = utf8_codepoint_len(*u8cur);
        // Check that the whole sequence lies inside the buffer *before*
        // decoding it, so truncated input is rejected without reading past
        // u8end.
        if (u8charLen > static_cast<size_t>(u8end - u8cur)) {
            return -1;
        }

        u16measuredLen++;
        const uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8charLen);
        if (codepoint > 0xFFFF) u16measuredLen++; // this will be a surrogate pair in utf16
        u8cur += u8charLen;
    }

    // Every sequence was verified to fit, so the cursor ended exactly at
    // u8end.
    return u16measuredLen;
}
/**
 * Convert a UTF-8 string to UTF-16. The destination UTF-16 buffer must have
 * space for NULL at the end.
 *
 * Code points above 0xFFFF are written as a surrogate pair. If the final
 * sequence is cut off by the end of the input, conversion stops there and
 * the output is terminated after the last complete character.
 */
void utf8_to_utf16(const uint8_t* u8str, size_t u8len, char16_t* u16str)
{
    const uint8_t* const u8end = u8str + u8len;
    const uint8_t* u8cur = u8str;
    char16_t* u16cur = u16str;

    while (u8cur < u8end) {
        const size_t seqLen = utf8_codepoint_len(*u8cur);
        // Never decode a sequence that runs past the end of the input.
        if (seqLen > static_cast<size_t>(u8end - u8cur)) {
            break;
        }
        uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, seqLen);

        // Convert the UTF32 codepoint to one or more UTF16 codepoints
        if (codepoint <= 0xFFFF) {
            // Single UTF16 character
            *u16cur++ = (char16_t) codepoint;
        } else {
            // Multiple UTF16 characters with surrogates
            codepoint = codepoint - 0x10000;
            *u16cur++ = (char16_t) ((codepoint >> 10) + 0xD800);
            *u16cur++ = (char16_t) ((codepoint & 0x3FF) + 0xDC00);
        }

        u8cur += seqLen;
    }
    *u16cur = 0;
}
}

View File

@ -8,7 +8,8 @@ ifneq ($(TARGET_SIMULATOR),true)
test_src_files := \
ObbFile_test.cpp \
Looper_test.cpp \
String8_test.cpp
String8_test.cpp \
Unicode_test.cpp
shared_libraries := \
libz \

View File

@ -0,0 +1,115 @@
/*
* Copyright (C) 2010 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#define LOG_TAG "Unicode_test"
#include <utils/Log.h>
#include <utils/Unicode.h>
#include <gtest/gtest.h>
namespace android {
// Fixture shared by the Unicode conversion tests below. SetUp() and
// TearDown() are intentionally empty: the fixture holds no state.
class UnicodeTest : public testing::Test {
protected:
virtual void SetUp() {
}
virtual void TearDown() {
}
};
// An input length of zero must measure as zero UTF-16 units.
TEST_F(UnicodeTest, UTF8toUTF16ZeroLength) {
    ssize_t measured;

    // Standard C++ has no zero-length arrays, so supply one placeholder byte
    // and pass an explicit length of 0; the byte itself is never examined.
    const uint8_t str[] = { 0 };

    measured = utf8_to_utf16_length(str, 0);
    EXPECT_EQ(0, measured)
            << "Zero length input should return zero length output.";
}
// A single ASCII byte measures as one UTF-16 unit.
TEST_F(UnicodeTest, UTF8toUTF16ASCIILength) {
    // U+0030 or ASCII '0'
    const uint8_t ascii[] = { 0x30 };

    const ssize_t utf16Len = utf8_to_utf16_length(ascii, sizeof(ascii));

    EXPECT_EQ(1, utf16Len)
            << "ASCII glyphs should have a length of 1 char16_t";
}
// A three-byte UTF-8 sequence whose code point is below U+10000 measures as
// one UTF-16 unit.
TEST_F(UnicodeTest, UTF8toUTF16Plane1Length) {
    // U+2323 SMILE, encoded as three UTF-8 bytes
    const uint8_t smile[] = { 0xE2, 0x8C, 0xA3 };

    const ssize_t utf16Len = utf8_to_utf16_length(smile, sizeof(smile));

    EXPECT_EQ(1, utf16Len)
            << "Plane 1 glyphs should have a length of 1 char16_t";
}
// A four-byte UTF-8 sequence needs a surrogate pair, i.e. two UTF-16 units.
TEST_F(UnicodeTest, UTF8toUTF16SurrogateLength) {
    // U+10000, the first code point above 0xFFFF
    const uint8_t supplementary[] = { 0xF0, 0x90, 0x80, 0x80 };

    const ssize_t utf16Len =
            utf8_to_utf16_length(supplementary, sizeof(supplementary));

    EXPECT_EQ(2, utf16Len)
            << "Surrogate pairs should have a length of 2 char16_t";
}
// A sequence that is cut off before its last byte must be reported as
// invalid.
TEST_F(UnicodeTest, UTF8toUTF16TruncatedUTF8) {
    // U+2323 SMILE with its final byte (0xA3) missing
    const uint8_t truncated[] = { 0xE2, 0x8C };

    const ssize_t utf16Len = utf8_to_utf16_length(truncated, sizeof(truncated));

    EXPECT_EQ(-1, utf16Len)
            << "Truncated UTF-8 should return -1 to indicate invalid";
}
// Converts one sequence of each UTF-8 length (1-4 bytes) and checks every
// UTF-16 unit produced, including the terminator.
TEST_F(UnicodeTest, UTF8toUTF16Normal) {
    const uint8_t str[] = {
        0x30, // U+0030, 1 UTF-16 character
        0xC4, 0x80, // U+0100, 1 UTF-16 character
        0xE2, 0x8C, 0xA3, // U+2323, 1 UTF-16 character
        0xF0, 0x90, 0x80, 0x80, // U+10000, 2 UTF-16 character
    };

    char16_t output[1 + 1 + 1 + 2 + 1]; // Room for NULL

    utf8_to_utf16(str, sizeof(str), output);

    EXPECT_EQ(0x0030, output[0])
            << "should be U+0030";
    EXPECT_EQ(0x0100, output[1])
            << "should be U+0100";
    EXPECT_EQ(0x2323, output[2])
            << "should be U+2323";
    EXPECT_EQ(0xD800, output[3])
            << "should be first half of surrogate U+10000";
    EXPECT_EQ(0xDC00, output[4])
            << "should be second half of surrogate U+10000";
    // The terminator is a code unit with value 0, not a pointer, so compare
    // against 0 rather than the null-pointer constant NULL.
    EXPECT_EQ(0, output[5])
            << "should be NULL terminated";
}
}