Merge "Split UTF functions from String8/16"
This commit is contained in:
commit
7986fe5035
|
@ -19,39 +19,12 @@
|
|||
|
||||
#include <utils/Errors.h>
|
||||
#include <utils/SharedBuffer.h>
|
||||
|
||||
#include <stdint.h>
|
||||
#include <sys/types.h>
|
||||
#include <utils/Unicode.h>
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
extern "C" {
|
||||
|
||||
typedef uint16_t char16_t;
|
||||
|
||||
// Standard string functions on char16 strings.
|
||||
int strcmp16(const char16_t *, const char16_t *);
|
||||
int strncmp16(const char16_t *s1, const char16_t *s2, size_t n);
|
||||
size_t strlen16(const char16_t *);
|
||||
size_t strnlen16(const char16_t *, size_t);
|
||||
char16_t *strcpy16(char16_t *, const char16_t *);
|
||||
char16_t *strncpy16(char16_t *, const char16_t *, size_t);
|
||||
|
||||
// Version of comparison that supports embedded nulls.
|
||||
// This is different than strncmp() because we don't stop
|
||||
// at a nul character and consider the strings to be different
|
||||
// if the lengths are different (thus we need to supply the
|
||||
// lengths of both strings). This can also be used when
|
||||
// your string is not nul-terminated as it will have the
|
||||
// equivalent result as strcmp16 (unlike strncmp16).
|
||||
int strzcmp16(const char16_t *s1, size_t n1, const char16_t *s2, size_t n2);
|
||||
|
||||
// Version of strzcmp16 for comparing strings in different endianness.
|
||||
int strzcmp16_h_n(const char16_t *s1H, size_t n1, const char16_t *s2N, size_t n2);
|
||||
|
||||
// Convert UTF-8 to UTF-16 including surrogate pairs
|
||||
void utf8_to_utf16(const uint8_t *src, size_t srcLen, char16_t* dst, const size_t dstLen);
|
||||
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
|
|
|
@ -18,122 +18,16 @@
|
|||
#define ANDROID_STRING8_H
|
||||
|
||||
#include <utils/Errors.h>
|
||||
#include <utils/SharedBuffer.h>
|
||||
#include <utils/Unicode.h>
|
||||
|
||||
// Need this for the char16_t type; String8.h should not
|
||||
// be dependent on the String16 class.
|
||||
#include <utils/String16.h>
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
extern "C" {
|
||||
|
||||
typedef uint32_t char32_t;
|
||||
|
||||
size_t strlen32(const char32_t *);
|
||||
size_t strnlen32(const char32_t *, size_t);
|
||||
|
||||
/*
|
||||
* Returns the length of "src" when "src" is valid UTF-8 string.
|
||||
* Returns 0 if src is NULL, 0-length string or non UTF-8 string.
|
||||
* This function should be used to determine whether "src" is valid UTF-8
|
||||
* characters with valid unicode codepoints. "src" must be null-terminated.
|
||||
*
|
||||
* If you are going to use other GetUtf... functions defined in this header
|
||||
* with string which may not be valid UTF-8 with valid codepoint (from 0 to
|
||||
* 0x10FFFF), you should use this function before calling others, since the
|
||||
* other functions do not check whether the string is valid UTF-8 or not.
|
||||
*
|
||||
* If you do not care whether "src" is valid UTF-8 or not, you should use
|
||||
* strlen() as usual, which should be much faster.
|
||||
*/
|
||||
size_t utf8_length(const char *src);
|
||||
|
||||
/*
|
||||
* Returns the UTF-32 length of "src".
|
||||
*/
|
||||
size_t utf32_length(const char *src, size_t src_len);
|
||||
|
||||
/*
|
||||
* Returns the UTF-8 length of "src".
|
||||
*/
|
||||
size_t utf8_length_from_utf16(const char16_t *src, size_t src_len);
|
||||
|
||||
/*
|
||||
* Returns the UTF-8 length of "src".
|
||||
*/
|
||||
size_t utf8_length_from_utf32(const char32_t *src, size_t src_len);
|
||||
|
||||
/*
|
||||
* Returns the unicode value at "index".
|
||||
* Returns -1 when the index is invalid (equals to or more than "src_len").
|
||||
* If returned value is positive, it is able to be converted to char32_t, which
|
||||
* is unsigned. Then, if "next_index" is not NULL, the next index to be used is
|
||||
* stored in "next_index". "next_index" can be NULL.
|
||||
*/
|
||||
int32_t utf32_at(const char *src, size_t src_len,
|
||||
size_t index, size_t *next_index);
|
||||
|
||||
/*
|
||||
* Stores a UTF-32 string converted from "src" in "dst", if "dst_length" is not
|
||||
* large enough to store the string, the part of the "src" string is stored
|
||||
* into "dst".
|
||||
* Returns the size actually used for storing the string.
|
||||
* "dst" is not null-terminated when dst_len is fully used (like strncpy).
|
||||
*/
|
||||
size_t utf8_to_utf32(const char* src, size_t src_len,
|
||||
char32_t* dst, size_t dst_len);
|
||||
|
||||
/*
|
||||
* Stores a UTF-8 string converted from "src" in "dst", if "dst_length" is not
|
||||
* large enough to store the string, the part of the "src" string is stored
|
||||
* into "dst" as much as possible. See the examples for more detail.
|
||||
* Returns the size actually used for storing the string.
|
||||
* "dst" is not null-terminated when dst_len is fully used (like strncpy).
|
||||
*
|
||||
* Example 1
|
||||
* "src" == \u3042\u3044 (\xE3\x81\x82\xE3\x81\x84)
|
||||
* "src_len" == 2
|
||||
* "dst_len" >= 7
|
||||
* ->
|
||||
* Returned value == 6
|
||||
* "dst" becomes \xE3\x81\x82\xE3\x81\x84\0
|
||||
* (note that "dst" is null-terminated)
|
||||
*
|
||||
* Example 2
|
||||
* "src" == \u3042\u3044 (\xE3\x81\x82\xE3\x81\x84)
|
||||
* "src_len" == 2
|
||||
* "dst_len" == 5
|
||||
* ->
|
||||
* Returned value == 3
|
||||
* "dst" becomes \xE3\x81\x82\0
|
||||
* (note that "dst" is null-terminated, but \u3044 is not stored in "dst"
|
||||
* since "dst" does not have enough size to store the character)
|
||||
*
|
||||
* Example 3
|
||||
* "src" == \u3042\u3044 (\xE3\x81\x82\xE3\x81\x84)
|
||||
* "src_len" == 2
|
||||
* "dst_len" == 6
|
||||
* ->
|
||||
* Returned value == 6
|
||||
* "dst" becomes \xE3\x81\x82\xE3\x81\x84
|
||||
* (note that "dst" is NOT null-terminated, like strncpy)
|
||||
*/
|
||||
size_t utf32_to_utf8(const char32_t* src, size_t src_len,
|
||||
char* dst, size_t dst_len);
|
||||
|
||||
size_t utf16_to_utf8(const char16_t* src, size_t src_len,
|
||||
char* dst, size_t dst_len);
|
||||
|
||||
}
|
||||
#include <string.h> // for strcmp
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
namespace android {
|
||||
|
||||
class String16;
|
||||
class TextOutput;
|
||||
|
||||
//! This is a string holding UTF-8 characters. Does not allow the value more
|
||||
|
@ -182,7 +76,7 @@ public:
|
|||
size_t getUtf32Length() const;
|
||||
int32_t getUtf32At(size_t index,
|
||||
size_t *next_index) const;
|
||||
size_t getUtf32(char32_t* dst, size_t dst_len) const;
|
||||
void getUtf32(char32_t* dst) const;
|
||||
|
||||
inline String8& operator=(const String8& other);
|
||||
inline String8& operator=(const char* other);
|
||||
|
|
|
@ -0,0 +1,161 @@
|
|||
/*
|
||||
* Copyright (C) 2005 The Android Open Source Project
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef ANDROID_UNICODE_H
|
||||
#define ANDROID_UNICODE_H
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <stdint.h>
|
||||
|
||||
extern "C" {
|
||||
|
||||
typedef uint32_t char32_t;
|
||||
typedef uint16_t char16_t;
|
||||
|
||||
// Standard string functions on char16_t strings.
|
||||
int strcmp16(const char16_t *, const char16_t *);
|
||||
int strncmp16(const char16_t *s1, const char16_t *s2, size_t n);
|
||||
size_t strlen16(const char16_t *);
|
||||
size_t strnlen16(const char16_t *, size_t);
|
||||
char16_t *strcpy16(char16_t *, const char16_t *);
|
||||
char16_t *strncpy16(char16_t *, const char16_t *, size_t);
|
||||
|
||||
// Version of comparison that supports embedded nulls.
|
||||
// This is different than strncmp() because we don't stop
|
||||
// at a nul character and consider the strings to be different
|
||||
// if the lengths are different (thus we need to supply the
|
||||
// lengths of both strings). This can also be used when
|
||||
// your string is not nul-terminated as it will have the
|
||||
// equivalent result as strcmp16 (unlike strncmp16).
|
||||
int strzcmp16(const char16_t *s1, size_t n1, const char16_t *s2, size_t n2);
|
||||
|
||||
// Version of strzcmp16 for comparing strings in different endianness.
|
||||
int strzcmp16_h_n(const char16_t *s1H, size_t n1, const char16_t *s2N, size_t n2);
|
||||
|
||||
// Standard string functions on char32_t strings.
|
||||
size_t strlen32(const char32_t *);
|
||||
size_t strnlen32(const char32_t *, size_t);
|
||||
|
||||
/**
|
||||
* Measure the length of a UTF-32 string in UTF-8. If the string is invalid
|
||||
* such as containing a surrogate character, -1 will be returned.
|
||||
*/
|
||||
ssize_t utf32_to_utf8_length(const char32_t *src, size_t src_len);
|
||||
|
||||
/**
|
||||
* Stores a UTF-8 string converted from "src" in "dst", if "dst_length" is not
|
||||
* large enough to store the string, the part of the "src" string is stored
|
||||
* into "dst" as much as possible. See the examples for more detail.
|
||||
* Returns the size actually used for storing the string.
|
||||
* "dst" is not null-terminated when dst_len is fully used (like strncpy).
|
||||
*
|
||||
* Example 1
|
||||
* "src" == \u3042\u3044 (\xE3\x81\x82\xE3\x81\x84)
|
||||
* "src_len" == 2
|
||||
* "dst_len" >= 7
|
||||
* ->
|
||||
* Returned value == 6
|
||||
* "dst" becomes \xE3\x81\x82\xE3\x81\x84\0
|
||||
* (note that "dst" is null-terminated)
|
||||
*
|
||||
* Example 2
|
||||
* "src" == \u3042\u3044 (\xE3\x81\x82\xE3\x81\x84)
|
||||
* "src_len" == 2
|
||||
* "dst_len" == 5
|
||||
* ->
|
||||
* Returned value == 3
|
||||
* "dst" becomes \xE3\x81\x82\0
|
||||
* (note that "dst" is null-terminated, but \u3044 is not stored in "dst"
|
||||
* since "dst" does not have enough size to store the character)
|
||||
*
|
||||
* Example 3
|
||||
* "src" == \u3042\u3044 (\xE3\x81\x82\xE3\x81\x84)
|
||||
* "src_len" == 2
|
||||
* "dst_len" == 6
|
||||
* ->
|
||||
* Returned value == 6
|
||||
* "dst" becomes \xE3\x81\x82\xE3\x81\x84
|
||||
* (note that "dst" is NOT null-terminated, like strncpy)
|
||||
*/
|
||||
void utf32_to_utf8(const char32_t* src, size_t src_len, char* dst);
|
||||
|
||||
/**
|
||||
* Returns the unicode value at "index".
|
||||
* Returns -1 when the index is invalid (equals to or more than "src_len").
|
||||
* If returned value is positive, it is able to be converted to char32_t, which
|
||||
* is unsigned. Then, if "next_index" is not NULL, the next index to be used is
|
||||
* stored in "next_index". "next_index" can be NULL.
|
||||
*/
|
||||
int32_t utf32_from_utf8_at(const char *src, size_t src_len, size_t index, size_t *next_index);
|
||||
|
||||
|
||||
/**
|
||||
* Returns the UTF-8 length of UTF-16 string "src".
|
||||
*/
|
||||
ssize_t utf16_to_utf8_length(const char16_t *src, size_t src_len);
|
||||
|
||||
/**
|
||||
* Converts a UTF-16 string to UTF-8. The destination buffer must be large
|
||||
* enough to fit the UTF-16 as measured by utf16_to_utf8_length with an added
|
||||
* NULL terminator.
|
||||
*/
|
||||
void utf16_to_utf8(const char16_t* src, size_t src_len, char* dst);
|
||||
|
||||
/**
|
||||
* Returns the length of "src" when "src" is valid UTF-8 string.
|
||||
* Returns 0 if src is NULL or 0-length string. Returns -1 when the source
|
||||
* is an invalid string.
|
||||
*
|
||||
* This function should be used to determine whether "src" is valid UTF-8
|
||||
* characters with valid unicode codepoints. "src" must be null-terminated.
|
||||
*
|
||||
* If you are going to use other utf8_to_... functions defined in this header
|
||||
* with string which may not be valid UTF-8 with valid codepoint (from 0 to
|
||||
* 0x10FFFF), you should use this function before calling others, since the
|
||||
* other functions do not check whether the string is valid UTF-8 or not.
|
||||
*
|
||||
* If you do not care whether "src" is valid UTF-8 or not, you should use
|
||||
* strlen() as usual, which should be much faster.
|
||||
*/
|
||||
ssize_t utf8_length(const char *src);
|
||||
|
||||
/**
|
||||
* Measure the length that UTF-8 string "src" will have once converted to UTF-32.
|
||||
*/
|
||||
size_t utf8_to_utf32_length(const char *src, size_t src_len);
|
||||
|
||||
/**
|
||||
* Stores a UTF-32 string converted from "src" in "dst". "dst" must be large
|
||||
* enough to store the entire converted string as measured by
|
||||
* utf8_to_utf32_length plus space for a NULL terminator.
|
||||
*/
|
||||
void utf8_to_utf32(const char* src, size_t src_len, char32_t* dst);
|
||||
|
||||
/**
|
||||
* Returns the UTF-16 length of UTF-8 string "src".
|
||||
*/
|
||||
ssize_t utf8_to_utf16_length(const uint8_t* src, size_t srcLen);
|
||||
|
||||
/**
|
||||
* Convert UTF-8 to UTF-16 including surrogate pairs. The destination buffer
|
||||
* must be large enough to hold the result as measured by utf8_to_utf16_length
|
||||
* plus an added NULL terminator.
|
||||
*/
|
||||
void utf8_to_utf16(const uint8_t* src, size_t srcLen, char16_t* dst);
|
||||
|
||||
}
|
||||
|
||||
#endif
|
|
@ -41,6 +41,7 @@ commonSources:= \
|
|||
TextOutput.cpp \
|
||||
Threads.cpp \
|
||||
Timers.cpp \
|
||||
Unicode.cpp \
|
||||
VectorImpl.cpp \
|
||||
ZipFileCRO.cpp \
|
||||
ZipFileRO.cpp \
|
||||
|
|
|
@ -444,15 +444,51 @@ void ResStringPool::uninit()
|
|||
}
|
||||
}
|
||||
|
||||
// Decodes the 1- or 2-unit length prefix found at "str" into "len" and
// advances "str" past the prefix. "chrsz" is the size of one unit in bytes.
// When the top bit of the first unit is set, the remaining bits of that unit
// become the high part of the length and the following unit the low part.
// NOTE: expands to several statements, so it must not be used as the body of
// an unbraced if/else, and call sites omit the trailing semicolon.
#define DECODE_LENGTH(str, chrsz, len) \
|
||||
len = *(str); \
|
||||
if (*(str)&(1<<(chrsz*8-1))) { \
|
||||
(str)++; \
|
||||
len = (((len)&((1<<(chrsz*8-1))-1))<<(chrsz*8)) + *(str); \
|
||||
} \
|
||||
(str)++;
|
||||
/**
|
||||
* Strings in UTF-16 format have length indicated by a length encoded in the
|
||||
* stored data. It is either 1 or 2 characters of length data. This allows a
|
||||
* maximum length of 0x7FFFFFFF (2147483647 bytes), but if you're storing that
|
||||
* much data in a string, you're abusing them.
|
||||
*
|
||||
* If the high bit is set, then there are two characters or 4 bytes of length
|
||||
* data encoded. In that case, drop the high bit of the first character and
|
||||
* add it together with the next character.
|
||||
*/
|
||||
static inline size_t
decodeLength(const char16_t** str)
{
    const char16_t* cursor = *str;

    // The first unit is either the whole length or, when its top bit is
    // set, the high 15 bits of a two-unit length.
    size_t decoded = *cursor++;
    if (decoded & 0x8000) {
        const size_t high = (decoded & 0x7FFF) << 16;
        decoded = high | *cursor++;
    }

    // Leave the caller's pointer on the first unit after the prefix.
    *str = cursor;
    return decoded;
}
|
||||
|
||||
const uint16_t* ResStringPool::stringAt(size_t idx, size_t* outLen) const
|
||||
/**
|
||||
* Strings in UTF-8 format have length indicated by a length encoded in the
|
||||
* stored data. It is either 1 or 2 characters of length data. This allows a
|
||||
* maximum length of 0x7FFF (32767 bytes), but you should consider storing
|
||||
* text in another way if you're using that much data in a single string.
|
||||
*
|
||||
* If the high bit is set, then there are two characters or 2 bytes of length
|
||||
* data encoded. In that case, drop the high bit of the first character and
|
||||
* add it together with the next character.
|
||||
*/
|
||||
static inline size_t
decodeLength(const uint8_t** str)
{
    const uint8_t* cursor = *str;

    // The first byte is either the whole length or, when its top bit is
    // set, the high 7 bits of a two-byte length.
    size_t decoded = *cursor++;
    if (decoded & 0x80) {
        const size_t high = (decoded & 0x7F) << 8;
        decoded = high | *cursor++;
    }

    // Leave the caller's pointer on the first byte after the prefix.
    *str = cursor;
    return decoded;
}
|
||||
|
||||
const uint16_t* ResStringPool::stringAt(size_t idx, size_t* u16len) const
|
||||
{
|
||||
if (mError == NO_ERROR && idx < mHeader->stringCount) {
|
||||
const bool isUTF8 = (mHeader->flags&ResStringPool_header::UTF8_FLAG) != 0;
|
||||
|
@ -461,37 +497,51 @@ const uint16_t* ResStringPool::stringAt(size_t idx, size_t* outLen) const
|
|||
if (!isUTF8) {
|
||||
const char16_t* strings = (char16_t*)mStrings;
|
||||
const char16_t* str = strings+off;
|
||||
DECODE_LENGTH(str, sizeof(char16_t), *outLen)
|
||||
if ((uint32_t)(str+*outLen-strings) < mStringPoolSize) {
|
||||
|
||||
*u16len = decodeLength(&str);
|
||||
if ((uint32_t)(str+*u16len-strings) < mStringPoolSize) {
|
||||
return str;
|
||||
} else {
|
||||
LOGW("Bad string block: string #%d extends to %d, past end at %d\n",
|
||||
(int)idx, (int)(str+*outLen-strings), (int)mStringPoolSize);
|
||||
(int)idx, (int)(str+*u16len-strings), (int)mStringPoolSize);
|
||||
}
|
||||
} else {
|
||||
const uint8_t* strings = (uint8_t*)mStrings;
|
||||
const uint8_t* str = strings+off;
|
||||
DECODE_LENGTH(str, sizeof(uint8_t), *outLen)
|
||||
size_t encLen;
|
||||
DECODE_LENGTH(str, sizeof(uint8_t), encLen)
|
||||
if ((uint32_t)(str+encLen-strings) < mStringPoolSize) {
|
||||
const uint8_t* u8str = strings+off;
|
||||
|
||||
*u16len = decodeLength(&u8str);
|
||||
size_t u8len = decodeLength(&u8str);
|
||||
|
||||
// encLen must be less than 0x7FFF due to encoding.
|
||||
if ((uint32_t)(u8str+u8len-strings) < mStringPoolSize) {
|
||||
AutoMutex lock(mDecodeLock);
|
||||
|
||||
if (mCache[idx] != NULL) {
|
||||
return mCache[idx];
|
||||
}
|
||||
char16_t *u16str = (char16_t *)calloc(*outLen+1, sizeof(char16_t));
|
||||
|
||||
ssize_t actualLen = utf8_to_utf16_length(u8str, u8len);
|
||||
if (actualLen < 0 || (size_t)actualLen != *u16len) {
|
||||
LOGW("Bad string block: string #%lld decoded length is not correct "
|
||||
"%lld vs %llu\n",
|
||||
(long long)idx, (long long)actualLen, (long long)*u16len);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
char16_t *u16str = (char16_t *)calloc(*u16len+1, sizeof(char16_t));
|
||||
if (!u16str) {
|
||||
LOGW("No memory when trying to allocate decode cache for string #%d\n",
|
||||
(int)idx);
|
||||
return NULL;
|
||||
}
|
||||
const unsigned char *u8src = reinterpret_cast<const unsigned char *>(str);
|
||||
utf8_to_utf16(u8src, encLen, u16str, *outLen);
|
||||
|
||||
utf8_to_utf16(u8str, u8len, u16str);
|
||||
mCache[idx] = u16str;
|
||||
return u16str;
|
||||
} else {
|
||||
LOGW("Bad string block: string #%d extends to %d, past end at %d\n",
|
||||
(int)idx, (int)(str+encLen-strings), (int)mStringPoolSize);
|
||||
LOGW("Bad string block: string #%lld extends to %lld, past end at %lld\n",
|
||||
(long long)idx, (long long)(u8str+u8len-strings),
|
||||
(long long)mStringPoolSize);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
@ -512,9 +562,8 @@ const char* ResStringPool::string8At(size_t idx, size_t* outLen) const
|
|||
if (isUTF8) {
|
||||
const uint8_t* strings = (uint8_t*)mStrings;
|
||||
const uint8_t* str = strings+off;
|
||||
DECODE_LENGTH(str, sizeof(uint8_t), *outLen)
|
||||
size_t encLen;
|
||||
DECODE_LENGTH(str, sizeof(uint8_t), encLen)
|
||||
*outLen = decodeLength(&str);
|
||||
size_t encLen = decodeLength(&str);
|
||||
if ((uint32_t)(str+encLen-strings) < mStringPoolSize) {
|
||||
return (const char*)str;
|
||||
} else {
|
||||
|
|
|
@ -18,228 +18,17 @@
|
|||
|
||||
#include <utils/Debug.h>
|
||||
#include <utils/Log.h>
|
||||
#include <utils/Unicode.h>
|
||||
#include <utils/String8.h>
|
||||
#include <utils/TextOutput.h>
|
||||
#include <utils/threads.h>
|
||||
|
||||
#include <private/utils/Static.h>
|
||||
|
||||
#ifdef HAVE_WINSOCK
|
||||
# undef nhtol
|
||||
# undef htonl
|
||||
# undef nhtos
|
||||
# undef htons
|
||||
|
||||
# ifdef HAVE_LITTLE_ENDIAN
|
||||
# define ntohl(x) ( ((x) << 24) | (((x) >> 24) & 255) | (((x) << 8) & 0xff0000) | (((x) >> 8) & 0xff00) )
|
||||
# define htonl(x) ntohl(x)
|
||||
# define ntohs(x) ( (((x) << 8) & 0xff00) | (((x) >> 8) & 255) )
|
||||
# define htons(x) ntohs(x)
|
||||
# else
|
||||
# define ntohl(x) (x)
|
||||
# define htonl(x) (x)
|
||||
# define ntohs(x) (x)
|
||||
# define htons(x) (x)
|
||||
# endif
|
||||
#else
|
||||
# include <netinet/in.h>
|
||||
#endif
|
||||
|
||||
#include <memory.h>
|
||||
#include <stdio.h>
|
||||
#include <ctype.h>
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/*
 * Compares two 0-terminated char16_t strings unit by unit.
 * Returns the difference of the first pair of units that differ (s1 minus
 * s2), or 0 when both strings end without a difference.
 */
int strcmp16(const char16_t *s1, const char16_t *s2)
{
    for (;; s1++, s2++) {
        const int diff = (int)*s1 - (int)*s2;
        // Stop at the first mismatch, or once the terminator has been
        // compared equal on both sides.
        if (diff != 0 || *s1 == 0) {
            return diff;
        }
    }
}
|
||||
|
||||
/*
 * Compares at most n units of two 0-terminated char16_t strings.
 * Returns the difference of the first pair of units that differ (s1 minus
 * s2), or 0 when no difference is found within n units or before the
 * terminator.
 */
int strncmp16(const char16_t *s1, const char16_t *s2, size_t n)
{
    for (size_t i = 0; i < n; i++) {
        const int diff = (int)s1[i] - (int)s2[i];
        if (diff != 0 || s1[i] == 0) {
            return diff;
        }
    }
    return 0;
}
|
||||
|
||||
/*
 * Copies the 0-terminated char16_t string src, terminator included, into
 * dst and returns dst.
 */
char16_t *strcpy16(char16_t *dst, const char16_t *src)
{
    for (size_t i = 0; ; i++) {
        const char16_t unit = src[i];
        dst[i] = unit;
        if (unit == 0) {
            break;
        }
    }
    return dst;
}
|
||||
|
||||
/*
 * Returns the number of char16_t units in s before the 0 terminator.
 */
size_t strlen16(const char16_t *s)
{
    size_t count = 0;
    while (s[count] != 0) {
        count++;
    }
    return count;
}
|
||||
|
||||
|
||||
/*
 * Copies at most n char16_t units from the 0-terminated string src into dst
 * and returns dst.
 *
 * Unlike strncpy(), the result is always terminated: when src holds n or
 * more units, a 0 is stored at dst[n], so dst must have room for n + 1
 * units. When src is shorter than n, copying stops after its terminator and
 * the rest of dst is left untouched (no zero padding).
 */
char16_t *strncpy16(char16_t *dst, const char16_t *src, size_t n)
{
    char16_t *out = dst;

    for (; n > 0; n--) {
        // Keep the full 16-bit unit: a narrower temporary would truncate
        // the stored value and mistake units such as 0x0100 for the
        // terminator.
        const char16_t unit = *src++;
        *out++ = unit;
        if (unit == 0) {
            // Terminator already stored; nothing more to write.
            return dst;
        }
    }

    // n units copied without meeting a terminator: terminate explicitly.
    *out = 0;
    return dst;
}
|
||||
|
||||
/*
 * Returns the number of char16_t units in s before the 0 terminator, but
 * never more than maxlen.
 */
size_t strnlen16(const char16_t *s, size_t maxlen)
{
    size_t count = 0;

    // The bound is tested before s[count] is read, so nothing beyond
    // s[maxlen - 1] is ever dereferenced.
    while (count < maxlen && s[count] != 0) {
        count++;
    }
    return count;
}
|
||||
|
||||
/*
 * Compares two char16_t strings given with explicit lengths; embedded 0
 * units do not stop the comparison.
 *
 * Returns the difference of the first differing pair of units (s1 minus
 * s2). When one string is a prefix of the other, the first extra unit of
 * the longer string decides the sign; equal lengths with no difference
 * yield 0.
 * NOTE(review): if that first extra unit is 0 the result is 0 although the
 * lengths differ - confirm whether callers rely on this.
 */
int strzcmp16(const char16_t *s1, size_t n1, const char16_t *s2, size_t n2)
{
    const size_t common = (n1 < n2) ? n1 : n2;

    for (size_t i = 0; i < common; i++) {
        const int diff = (int)s1[i] - (int)s2[i];
        if (diff != 0) {
            return diff;
        }
    }

    if (n1 < n2) {
        return 0 - (int)s2[common];
    }
    if (n1 > n2) {
        return (int)s1[common] - 0;
    }
    return 0;
}
|
||||
|
||||
/*
 * Same comparison as strzcmp16, except that every unit of s2N is passed
 * through ntohs() before it is compared against the corresponding unit of
 * s1H, which is used as stored.
 */
int strzcmp16_h_n(const char16_t *s1H, size_t n1, const char16_t *s2N, size_t n2)
{
    const size_t common = (n1 < n2) ? n1 : n2;

    for (size_t i = 0; i < common; i++) {
        const char16_t swapped = ntohs(s2N[i]);
        const int diff = (int)s1H[i] - (int)swapped;
        if (diff != 0) {
            return diff;
        }
    }

    if (n1 < n2) {
        return 0 - (int)ntohs(s2N[common]);
    }
    if (n1 > n2) {
        return (int)s1H[common] - 0;
    }
    return 0;
}
|
||||
|
||||
/*
 * Returns the length in bytes (1-4) of the UTF-8 sequence introduced by
 * lead byte ch. The constant is a packed table of 2-bit entries indexed by
 * the high nibble of ch: 0xF -> 4, 0xE -> 3, 0xC/0xD -> 2, anything else
 * (ASCII and continuation bytes) -> 1.
 */
static inline size_t
utf8_char_len(uint8_t ch)
{
    return ((0xe5000000 >> ((ch >> 3) & 0x1e)) & 3) + 1;
}

/* Appends the low 6 bits of a continuation byte to the accumulated value. */
#define UTF8_SHIFT_AND_MASK(unicode, byte) (unicode)<<=6; (unicode) |= (0x3f & (byte));

/*
 * Decodes one UTF-8 sequence of "length" bytes starting at src into its
 * code point. The caller must guarantee that "length" bytes are readable.
 * Continuation bytes are not validated. Returns 0xffff for a length outside
 * 1-4.
 */
static inline uint32_t
utf8_to_utf32(const uint8_t *src, size_t length)
{
    uint32_t unicode;

    switch (length)
    {
        case 1:
            return src[0];
        case 2:
            unicode = src[0] & 0x1f;
            UTF8_SHIFT_AND_MASK(unicode, src[1])
            return unicode;
        case 3:
            unicode = src[0] & 0x0f;
            UTF8_SHIFT_AND_MASK(unicode, src[1])
            UTF8_SHIFT_AND_MASK(unicode, src[2])
            return unicode;
        case 4:
            unicode = src[0] & 0x07;
            UTF8_SHIFT_AND_MASK(unicode, src[1])
            UTF8_SHIFT_AND_MASK(unicode, src[2])
            UTF8_SHIFT_AND_MASK(unicode, src[3])
            return unicode;
        default:
            return 0xffff;
    }
}

/*
 * Converts srcLen bytes of UTF-8 at src into UTF-16 at dst, emitting
 * surrogate pairs for code points above 0xFFFF.
 *
 * dstLen is the capacity of dst in char16_t units; no more than dstLen
 * units are ever written. Conversion stops early when the next character
 * does not fit completely in dst, or when the input ends in the middle of
 * a multi-byte sequence. A terminating 0 is stored only if a unit of dst
 * is still free afterwards.
 */
void
utf8_to_utf16(const uint8_t *src, size_t srcLen,
        char16_t* dst, const size_t dstLen)
{
    const uint8_t* const end = src + srcLen;
    const char16_t* const dstEnd = dst + dstLen;
    while (src < end && dst < dstEnd) {
        size_t len = utf8_char_len(*src);
        if ((size_t)(end - src) < len) {
            // Truncated trailing sequence: decoding it would read past
            // the end of the input.
            break;
        }
        uint32_t codepoint = utf8_to_utf32(src, len);

        // Convert the UTF32 codepoint to one or more UTF16 codepoints
        if (codepoint <= 0xFFFF) {
            // Single UTF16 character
            *dst++ = (char16_t) codepoint;
        } else {
            // A surrogate pair needs two free units; never write half a
            // pair or run past dstEnd.
            if (dstEnd - dst < 2) {
                break;
            }
            codepoint = codepoint - 0x10000;
            *dst++ = (char16_t) ((codepoint >> 10) + 0xD800);
            *dst++ = (char16_t) ((codepoint & 0x3FF) + 0xDC00);
        }

        src += len;
    }
    if (dst < dstEnd) {
        *dst = 0;
    }
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
namespace android {
|
||||
|
||||
|
@ -270,37 +59,33 @@ void terminate_string16()
|
|||
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
static char16_t* allocFromUTF8(const char* in, size_t len)
|
||||
static char16_t* allocFromUTF8(const char* u8str, size_t u8len)
|
||||
{
|
||||
if (len == 0) return getEmptyString();
|
||||
|
||||
size_t chars = 0;
|
||||
const char* end = in+len;
|
||||
const char* p = in;
|
||||
|
||||
while (p < end) {
|
||||
chars++;
|
||||
int utf8len = utf8_char_len(*p);
|
||||
uint32_t codepoint = utf8_to_utf32((const uint8_t*)p, utf8len);
|
||||
if (codepoint > 0xFFFF) chars++; // this will be a surrogate pair in utf16
|
||||
p += utf8len;
|
||||
if (u8len == 0) return getEmptyString();
|
||||
|
||||
const uint8_t* u8cur = (const uint8_t*) u8str;
|
||||
|
||||
const ssize_t u16len = utf8_to_utf16_length(u8cur, u8len);
|
||||
if (u16len < 0) {
|
||||
return getEmptyString();
|
||||
}
|
||||
|
||||
size_t bufSize = (chars+1)*sizeof(char16_t);
|
||||
SharedBuffer* buf = SharedBuffer::alloc(bufSize);
|
||||
|
||||
const uint8_t* const u8end = u8cur + u8len;
|
||||
|
||||
SharedBuffer* buf = SharedBuffer::alloc(sizeof(char16_t)*(u16len+1));
|
||||
if (buf) {
|
||||
p = in;
|
||||
char16_t* str = (char16_t*)buf->data();
|
||||
|
||||
utf8_to_utf16((const uint8_t*)p, len, str, bufSize);
|
||||
u8cur = (const uint8_t*) u8str;
|
||||
char16_t* u16str = (char16_t*)buf->data();
|
||||
|
||||
utf8_to_utf16(u8cur, u8len, u16str);
|
||||
|
||||
//printf("Created UTF-16 string from UTF-8 \"%s\":", in);
|
||||
//printHexData(1, str, buf->size(), 16, 1);
|
||||
//printf("\n");
|
||||
|
||||
return str;
|
||||
return u16str;
|
||||
}
|
||||
|
||||
|
||||
return getEmptyString();
|
||||
}
|
||||
|
||||
|
|
|
@ -17,6 +17,8 @@
|
|||
#include <utils/String8.h>
|
||||
|
||||
#include <utils/Log.h>
|
||||
#include <utils/Unicode.h>
|
||||
#include <utils/SharedBuffer.h>
|
||||
#include <utils/String16.h>
|
||||
#include <utils/TextOutput.h>
|
||||
#include <utils/threads.h>
|
||||
|
@ -34,94 +36,10 @@
|
|||
|
||||
namespace android {
|
||||
|
||||
// A UTF-8 continuation byte is produced as (value | kByteMark) & kByteMask,
// which sets bit 7 and clears bit 6 of the low byte (10xxxxxx).
static const char32_t kByteMask = 0x000000BF;
|
||||
static const char32_t kByteMark = 0x00000080;
|
||||
|
||||
// Surrogates aren't valid for UTF-32 characters, so define some
|
||||
// constants that will let us screen them out.
|
||||
static const char32_t kUnicodeSurrogateHighStart = 0x0000D800;
|
||||
static const char32_t kUnicodeSurrogateHighEnd = 0x0000DBFF;
|
||||
static const char32_t kUnicodeSurrogateLowStart = 0x0000DC00;
|
||||
static const char32_t kUnicodeSurrogateLowEnd = 0x0000DFFF;
|
||||
// Full surrogate range: first high surrogate through last low surrogate.
static const char32_t kUnicodeSurrogateStart = kUnicodeSurrogateHighStart;
|
||||
static const char32_t kUnicodeSurrogateEnd = kUnicodeSurrogateLowEnd;
|
||||
static const char32_t kUnicodeMaxCodepoint = 0x0010FFFF;
|
||||
|
||||
// Mask used to set appropriate bits in first byte of UTF-8 sequence,
|
||||
// indexed by number of bytes in the sequence.
|
||||
// 0xxxxxxx
|
||||
// -> (00-7f) 7bit. Bit mask for the first byte is 0x00000000
|
||||
// 110yyyyx 10xxxxxx
|
||||
// -> (c0-df)(80-bf) 11bit. Bit mask is 0x000000C0
|
||||
// 1110yyyy 10yxxxxx 10xxxxxx
|
||||
// -> (e0-ef)(80-bf)(80-bf) 16bit. Bit mask is 0x000000E0
|
||||
// 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
|
||||
// -> (f0-f7)(80-bf)(80-bf)(80-bf) 21bit. Bit mask is 0x000000F0
|
||||
static const char32_t kFirstByteMark[] = {
|
||||
0x00000000, 0x00000000, 0x000000C0, 0x000000E0, 0x000000F0
|
||||
};
|
||||
|
||||
// Separator used by resource paths. This is not platform dependent contrary
|
||||
// to OS_PATH_SEPARATOR.
|
||||
#define RES_PATH_SEPARATOR '/'
|
||||
|
||||
// Return number of utf8 bytes required for the character.
|
||||
static size_t utf32_to_utf8_bytes(char32_t srcChar)
|
||||
{
|
||||
size_t bytesToWrite;
|
||||
|
||||
// Figure out how many bytes the result will require.
|
||||
if (srcChar < 0x00000080)
|
||||
{
|
||||
bytesToWrite = 1;
|
||||
}
|
||||
else if (srcChar < 0x00000800)
|
||||
{
|
||||
bytesToWrite = 2;
|
||||
}
|
||||
else if (srcChar < 0x00010000)
|
||||
{
|
||||
if ((srcChar < kUnicodeSurrogateStart)
|
||||
|| (srcChar > kUnicodeSurrogateEnd))
|
||||
{
|
||||
bytesToWrite = 3;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Surrogates are invalid UTF-32 characters.
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
// Max code point for Unicode is 0x0010FFFF.
|
||||
else if (srcChar <= kUnicodeMaxCodepoint)
|
||||
{
|
||||
bytesToWrite = 4;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Invalid UTF-32 character.
|
||||
return 0;
|
||||
}
|
||||
|
||||
return bytesToWrite;
|
||||
}
|
||||
|
||||
// Write out the source character to <dstP>.
|
||||
|
||||
static void utf32_to_utf8(uint8_t* dstP, char32_t srcChar, size_t bytes)
|
||||
{
|
||||
dstP += bytes;
|
||||
switch (bytes)
|
||||
{ /* note: everything falls through. */
|
||||
case 4: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6;
|
||||
case 3: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6;
|
||||
case 2: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6;
|
||||
case 1: *--dstP = (uint8_t)(srcChar | kFirstByteMark[bytes]);
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
static SharedBuffer* gEmptyStringBuf = NULL;
|
||||
static char* gEmptyString = NULL;
|
||||
|
||||
|
@ -175,62 +93,47 @@ static char* allocFromUTF8(const char* in, size_t len)
|
|||
return getEmptyString();
|
||||
}
|
||||
|
||||
template<typename T, typename L>
|
||||
static char* allocFromUTF16OrUTF32(const T* in, L len)
|
||||
{
|
||||
if (len == 0) return getEmptyString();
|
||||
|
||||
size_t bytes = 0;
|
||||
const T* end = in+len;
|
||||
const T* p = in;
|
||||
|
||||
while (p < end) {
|
||||
bytes += utf32_to_utf8_bytes(*p);
|
||||
p++;
|
||||
}
|
||||
|
||||
SharedBuffer* buf = SharedBuffer::alloc(bytes+1);
|
||||
LOG_ASSERT(buf, "Unable to allocate shared buffer");
|
||||
if (buf) {
|
||||
p = in;
|
||||
char* str = (char*)buf->data();
|
||||
char* d = str;
|
||||
while (p < end) {
|
||||
const T c = *p++;
|
||||
size_t len = utf32_to_utf8_bytes(c);
|
||||
utf32_to_utf8((uint8_t*)d, c, len);
|
||||
d += len;
|
||||
}
|
||||
*d = 0;
|
||||
|
||||
return str;
|
||||
}
|
||||
|
||||
return getEmptyString();
|
||||
}
|
||||
|
||||
static char* allocFromUTF16(const char16_t* in, size_t len)
|
||||
{
|
||||
if (len == 0) return getEmptyString();
|
||||
|
||||
const size_t bytes = utf8_length_from_utf16(in, len);
|
||||
const ssize_t bytes = utf16_to_utf8_length(in, len);
|
||||
if (bytes < 0) {
|
||||
return getEmptyString();
|
||||
}
|
||||
|
||||
SharedBuffer* buf = SharedBuffer::alloc(bytes+1);
|
||||
LOG_ASSERT(buf, "Unable to allocate shared buffer");
|
||||
if (buf) {
|
||||
char* str = (char*)buf->data();
|
||||
|
||||
utf16_to_utf8(in, len, str, bytes+1);
|
||||
|
||||
return str;
|
||||
if (!buf) {
|
||||
return getEmptyString();
|
||||
}
|
||||
|
||||
return getEmptyString();
|
||||
char* str = (char*)buf->data();
|
||||
utf16_to_utf8(in, len, str);
|
||||
return str;
|
||||
}
|
||||
|
||||
static char* allocFromUTF32(const char32_t* in, size_t len)
|
||||
{
|
||||
return allocFromUTF16OrUTF32<char32_t, size_t>(in, len);
|
||||
if (len == 0) {
|
||||
return getEmptyString();
|
||||
}
|
||||
|
||||
const ssize_t bytes = utf32_to_utf8_length(in, len);
|
||||
if (bytes < 0) {
|
||||
return getEmptyString();
|
||||
}
|
||||
|
||||
SharedBuffer* buf = SharedBuffer::alloc(bytes+1);
|
||||
LOG_ASSERT(buf, "Unable to allocate shared buffer");
|
||||
if (!buf) {
|
||||
return getEmptyString();
|
||||
}
|
||||
|
||||
char* str = (char*) buf->data();
|
||||
utf32_to_utf8(in, len, str);
|
||||
|
||||
return str;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
|
@ -510,17 +413,17 @@ void String8::toUpper(size_t start, size_t length)
|
|||
|
||||
size_t String8::getUtf32Length() const
|
||||
{
|
||||
return utf32_length(mString, length());
|
||||
return utf8_to_utf32_length(mString, length());
|
||||
}
|
||||
|
||||
int32_t String8::getUtf32At(size_t index, size_t *next_index) const
|
||||
{
|
||||
return utf32_at(mString, length(), index, next_index);
|
||||
return utf32_from_utf8_at(mString, length(), index, next_index);
|
||||
}
|
||||
|
||||
size_t String8::getUtf32(char32_t* dst, size_t dst_len) const
|
||||
void String8::getUtf32(char32_t* dst) const
|
||||
{
|
||||
return utf8_to_utf32(mString, length(), dst, dst_len);
|
||||
utf8_to_utf32(mString, length(), dst);
|
||||
}
|
||||
|
||||
TextOutput& operator<<(TextOutput& to, const String8& val)
|
||||
|
@ -705,241 +608,3 @@ String8& String8::convertToResPath()
|
|||
}
|
||||
|
||||
}; // namespace android
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
size_t strlen32(const char32_t *s)
|
||||
{
|
||||
const char32_t *ss = s;
|
||||
while ( *ss )
|
||||
ss++;
|
||||
return ss-s;
|
||||
}
|
||||
|
||||
size_t strnlen32(const char32_t *s, size_t maxlen)
|
||||
{
|
||||
const char32_t *ss = s;
|
||||
while ((maxlen > 0) && *ss) {
|
||||
ss++;
|
||||
maxlen--;
|
||||
}
|
||||
return ss-s;
|
||||
}
|
||||
|
||||
size_t utf8_length(const char *src)
|
||||
{
|
||||
const char *cur = src;
|
||||
size_t ret = 0;
|
||||
while (*cur != '\0') {
|
||||
const char first_char = *cur++;
|
||||
if ((first_char & 0x80) == 0) { // ASCII
|
||||
ret += 1;
|
||||
continue;
|
||||
}
|
||||
// (UTF-8's character must not be like 10xxxxxx,
|
||||
// but 110xxxxx, 1110xxxx, ... or 1111110x)
|
||||
if ((first_char & 0x40) == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int32_t mask, to_ignore_mask;
|
||||
size_t num_to_read = 0;
|
||||
char32_t utf32 = 0;
|
||||
for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0x80;
|
||||
num_to_read < 5 && (first_char & mask);
|
||||
num_to_read++, to_ignore_mask |= mask, mask >>= 1) {
|
||||
if ((*cur & 0xC0) != 0x80) { // must be 10xxxxxx
|
||||
return 0;
|
||||
}
|
||||
// 0x3F == 00111111
|
||||
utf32 = (utf32 << 6) + (*cur++ & 0x3F);
|
||||
}
|
||||
// "first_char" must be (110xxxxx - 11110xxx)
|
||||
if (num_to_read == 5) {
|
||||
return 0;
|
||||
}
|
||||
to_ignore_mask |= mask;
|
||||
utf32 |= ((~to_ignore_mask) & first_char) << (6 * (num_to_read - 1));
|
||||
if (utf32 > android::kUnicodeMaxCodepoint) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
ret += num_to_read;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
size_t utf32_length(const char *src, size_t src_len)
|
||||
{
|
||||
if (src == NULL || src_len == 0) {
|
||||
return 0;
|
||||
}
|
||||
size_t ret = 0;
|
||||
const char* cur;
|
||||
const char* end;
|
||||
size_t num_to_skip;
|
||||
for (cur = src, end = src + src_len, num_to_skip = 1;
|
||||
cur < end;
|
||||
cur += num_to_skip, ret++) {
|
||||
const char first_char = *cur;
|
||||
num_to_skip = 1;
|
||||
if ((first_char & 0x80) == 0) { // ASCII
|
||||
continue;
|
||||
}
|
||||
int32_t mask;
|
||||
|
||||
for (mask = 0x40; (first_char & mask); num_to_skip++, mask >>= 1) {
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
size_t utf8_length_from_utf32(const char32_t *src, size_t src_len)
|
||||
{
|
||||
if (src == NULL || src_len == 0) {
|
||||
return 0;
|
||||
}
|
||||
size_t ret = 0;
|
||||
const char32_t *end = src + src_len;
|
||||
while (src < end) {
|
||||
ret += android::utf32_to_utf8_bytes(*src++);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
size_t utf8_length_from_utf16(const char16_t *src, size_t src_len)
|
||||
{
|
||||
if (src == NULL || src_len == 0) {
|
||||
return 0;
|
||||
}
|
||||
size_t ret = 0;
|
||||
const char16_t* const end = src + src_len;
|
||||
while (src < end) {
|
||||
if ((*src & 0xFC00) == 0xD800 && (src + 1) < end
|
||||
&& (*++src & 0xFC00) == 0xDC00) {
|
||||
// surrogate pairs are always 4 bytes.
|
||||
ret += 4;
|
||||
src++;
|
||||
} else {
|
||||
ret += android::utf32_to_utf8_bytes((char32_t) *src++);
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int32_t utf32_at_internal(const char* cur, size_t *num_read)
|
||||
{
|
||||
const char first_char = *cur;
|
||||
if ((first_char & 0x80) == 0) { // ASCII
|
||||
*num_read = 1;
|
||||
return *cur;
|
||||
}
|
||||
cur++;
|
||||
char32_t mask, to_ignore_mask;
|
||||
size_t num_to_read = 0;
|
||||
char32_t utf32 = first_char;
|
||||
for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0xFFFFFF80;
|
||||
(first_char & mask);
|
||||
num_to_read++, to_ignore_mask |= mask, mask >>= 1) {
|
||||
// 0x3F == 00111111
|
||||
utf32 = (utf32 << 6) + (*cur++ & 0x3F);
|
||||
}
|
||||
to_ignore_mask |= mask;
|
||||
utf32 &= ~(to_ignore_mask << (6 * (num_to_read - 1)));
|
||||
|
||||
*num_read = num_to_read;
|
||||
return static_cast<int32_t>(utf32);
|
||||
}
|
||||
|
||||
int32_t utf32_at(const char *src, size_t src_len,
|
||||
size_t index, size_t *next_index)
|
||||
{
|
||||
if (index >= src_len) {
|
||||
return -1;
|
||||
}
|
||||
size_t dummy_index;
|
||||
if (next_index == NULL) {
|
||||
next_index = &dummy_index;
|
||||
}
|
||||
size_t num_read;
|
||||
int32_t ret = utf32_at_internal(src + index, &num_read);
|
||||
if (ret >= 0) {
|
||||
*next_index = index + num_read;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
size_t utf8_to_utf32(const char* src, size_t src_len,
|
||||
char32_t* dst, size_t dst_len)
|
||||
{
|
||||
if (src == NULL || src_len == 0 || dst == NULL || dst_len == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
const char* cur = src;
|
||||
const char* end = src + src_len;
|
||||
char32_t* cur_utf32 = dst;
|
||||
const char32_t* end_utf32 = dst + dst_len;
|
||||
while (cur_utf32 < end_utf32 && cur < end) {
|
||||
size_t num_read;
|
||||
*cur_utf32++ =
|
||||
static_cast<char32_t>(utf32_at_internal(cur, &num_read));
|
||||
cur += num_read;
|
||||
}
|
||||
if (cur_utf32 < end_utf32) {
|
||||
*cur_utf32 = 0;
|
||||
}
|
||||
return static_cast<size_t>(cur_utf32 - dst);
|
||||
}
|
||||
|
||||
size_t utf32_to_utf8(const char32_t* src, size_t src_len,
|
||||
char* dst, size_t dst_len)
|
||||
{
|
||||
if (src == NULL || src_len == 0 || dst == NULL || dst_len == 0) {
|
||||
return 0;
|
||||
}
|
||||
const char32_t *cur_utf32 = src;
|
||||
const char32_t *end_utf32 = src + src_len;
|
||||
char *cur = dst;
|
||||
const char *end = dst + dst_len;
|
||||
while (cur_utf32 < end_utf32 && cur < end) {
|
||||
size_t len = android::utf32_to_utf8_bytes(*cur_utf32);
|
||||
android::utf32_to_utf8((uint8_t *)cur, *cur_utf32++, len);
|
||||
cur += len;
|
||||
}
|
||||
if (cur < end) {
|
||||
*cur = '\0';
|
||||
}
|
||||
return cur - dst;
|
||||
}
|
||||
|
||||
size_t utf16_to_utf8(const char16_t* src, size_t src_len,
|
||||
char* dst, size_t dst_len)
|
||||
{
|
||||
if (src == NULL || src_len == 0 || dst == NULL || dst_len == 0) {
|
||||
return 0;
|
||||
}
|
||||
const char16_t* cur_utf16 = src;
|
||||
const char16_t* const end_utf16 = src + src_len;
|
||||
char *cur = dst;
|
||||
const char* const end = dst + dst_len;
|
||||
while (cur_utf16 < end_utf16 && cur < end) {
|
||||
char32_t utf32;
|
||||
// surrogate pairs
|
||||
if ((*cur_utf16 & 0xFC00) == 0xD800 && (cur_utf16 + 1) < end_utf16) {
|
||||
utf32 = (*cur_utf16++ - 0xD800) << 10;
|
||||
utf32 |= *cur_utf16++ - 0xDC00;
|
||||
utf32 += 0x10000;
|
||||
} else {
|
||||
utf32 = (char32_t) *cur_utf16++;
|
||||
}
|
||||
size_t len = android::utf32_to_utf8_bytes(utf32);
|
||||
android::utf32_to_utf8((uint8_t*)cur, utf32, len);
|
||||
cur += len;
|
||||
}
|
||||
if (cur < end) {
|
||||
*cur = '\0';
|
||||
}
|
||||
return cur - dst;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,575 @@
|
|||
/*
|
||||
* Copyright (C) 2005 The Android Open Source Project
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <utils/Unicode.h>
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
// Byte-order conversion helpers (ntohl/htonl/ntohs/htons).
// On Winsock builds any existing definitions are discarded and replaced by
// local macros; everywhere else they come from <netinet/in.h>.
// Note: the little-endian macros evaluate their argument more than once, so
// do not pass expressions with side effects.
#ifdef HAVE_WINSOCK
# undef ntohl
# undef htonl
# undef ntohs
# undef htons

# ifdef HAVE_LITTLE_ENDIAN
#  define ntohl(x)    ( ((x) << 24) | (((x) >> 24) & 255) | (((x) << 8) & 0xff0000) | (((x) >> 8) & 0xff00) )
#  define htonl(x)    ntohl(x)
#  define ntohs(x)    ( (((x) << 8) & 0xff00) | (((x) >> 8) & 255) )
#  define htons(x)    ntohs(x)
# else
#  define ntohl(x)    (x)
#  define htonl(x)    (x)
#  define ntohs(x)    (x)
#  define htons(x)    (x)
# endif
#else
# include <netinet/in.h>
#endif
|
||||
|
||||
extern "C" {
|
||||
|
||||
static const char32_t kByteMask = 0x000000BF;
|
||||
static const char32_t kByteMark = 0x00000080;
|
||||
|
||||
// Surrogates aren't valid for UTF-32 characters, so define some
|
||||
// constants that will let us screen them out.
|
||||
static const char32_t kUnicodeSurrogateHighStart = 0x0000D800;
|
||||
static const char32_t kUnicodeSurrogateHighEnd = 0x0000DBFF;
|
||||
static const char32_t kUnicodeSurrogateLowStart = 0x0000DC00;
|
||||
static const char32_t kUnicodeSurrogateLowEnd = 0x0000DFFF;
|
||||
static const char32_t kUnicodeSurrogateStart = kUnicodeSurrogateHighStart;
|
||||
static const char32_t kUnicodeSurrogateEnd = kUnicodeSurrogateLowEnd;
|
||||
static const char32_t kUnicodeMaxCodepoint = 0x0010FFFF;
|
||||
|
||||
// Mask used to set appropriate bits in first byte of UTF-8 sequence,
|
||||
// indexed by number of bytes in the sequence.
|
||||
// 0xxxxxxx
|
||||
// -> (00-7f) 7bit. Bit mask for the first byte is 0x00000000
|
||||
// 110yyyyx 10xxxxxx
|
||||
// -> (c0-df)(80-bf) 11bit. Bit mask is 0x000000C0
|
||||
// 1110yyyy 10yxxxxx 10xxxxxx
|
||||
// -> (e0-ef)(80-bf)(80-bf) 16bit. Bit mask is 0x000000E0
|
||||
// 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
|
||||
// -> (f0-f7)(80-bf)(80-bf)(80-bf) 21bit. Bit mask is 0x000000F0
|
||||
static const char32_t kFirstByteMark[] = {
|
||||
0x00000000, 0x00000000, 0x000000C0, 0x000000E0, 0x000000F0
|
||||
};
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// UTF-32
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Return number of UTF-8 bytes required for the character. If the character
|
||||
* is invalid, return size of 0.
|
||||
*/
|
||||
static inline size_t utf32_codepoint_utf8_length(char32_t srcChar)
|
||||
{
|
||||
// Figure out how many bytes the result will require.
|
||||
if (srcChar < 0x00000080) {
|
||||
return 1;
|
||||
} else if (srcChar < 0x00000800) {
|
||||
return 2;
|
||||
} else if (srcChar < 0x00010000) {
|
||||
if ((srcChar < kUnicodeSurrogateStart) || (srcChar > kUnicodeSurrogateEnd)) {
|
||||
return 3;
|
||||
} else {
|
||||
// Surrogates are invalid UTF-32 characters.
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
// Max code point for Unicode is 0x0010FFFF.
|
||||
else if (srcChar <= kUnicodeMaxCodepoint) {
|
||||
return 4;
|
||||
} else {
|
||||
// Invalid UTF-32 character.
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Write out the source character to <dstP>.
|
||||
|
||||
static inline void utf32_codepoint_to_utf8(uint8_t* dstP, char32_t srcChar, size_t bytes)
|
||||
{
|
||||
dstP += bytes;
|
||||
switch (bytes)
|
||||
{ /* note: everything falls through. */
|
||||
case 4: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6;
|
||||
case 3: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6;
|
||||
case 2: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6;
|
||||
case 1: *--dstP = (uint8_t)(srcChar | kFirstByteMark[bytes]);
|
||||
}
|
||||
}
|
||||
|
||||
// Number of char32_t units before the terminating zero unit.
size_t strlen32(const char32_t *s)
{
    size_t count = 0;
    while (s[count] != 0) {
        count++;
    }
    return count;
}
|
||||
|
||||
// Like strlen32, but never examines more than maxlen units.
size_t strnlen32(const char32_t *s, size_t maxlen)
{
    size_t count = 0;
    // The bound is tested before the element is read.
    while (count < maxlen && s[count] != 0) {
        count++;
    }
    return count;
}
|
||||
|
||||
// Decode the UTF-8 sequence starting at cur. Stores the number of bytes
// consumed in *num_read and returns the decoded value. The input is not
// validated: continuation bytes are read according to the lead byte only.
static inline int32_t utf32_at_internal(const char* cur, size_t *num_read)
{
    const char lead = *cur;
    if ((lead & 0x80) == 0) { // ASCII
        *num_read = 1;
        return *cur;
    }

    const char* next = cur + 1;
    char32_t value = lead;
    char32_t bit = 0x40;
    char32_t ignore = 0xFFFFFF80;
    size_t count = 1;
    // One continuation byte per additional leading 1 bit in the lead byte.
    while (lead & bit) {
        // 0x3F == 00111111
        value = (value << 6) + (*next++ & 0x3F);
        count++;
        ignore |= bit;
        bit >>= 1;
    }
    // Strip the length-marker bits of the lead byte, which have been shifted
    // up by 6 bits per continuation byte.
    ignore |= bit;
    value &= ~(ignore << (6 * (count - 1)));

    *num_read = count;
    return static_cast<int32_t>(value);
}
|
||||
|
||||
int32_t utf32_from_utf8_at(const char *src, size_t src_len, size_t index, size_t *next_index)
|
||||
{
|
||||
if (index >= src_len) {
|
||||
return -1;
|
||||
}
|
||||
size_t dummy_index;
|
||||
if (next_index == NULL) {
|
||||
next_index = &dummy_index;
|
||||
}
|
||||
size_t num_read;
|
||||
int32_t ret = utf32_at_internal(src + index, &num_read);
|
||||
if (ret >= 0) {
|
||||
*next_index = index + num_read;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
ssize_t utf32_to_utf8_length(const char32_t *src, size_t src_len)
|
||||
{
|
||||
if (src == NULL || src_len == 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
size_t ret = 0;
|
||||
const char32_t *end = src + src_len;
|
||||
while (src < end) {
|
||||
ret += utf32_codepoint_utf8_length(*src++);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
void utf32_to_utf8(const char32_t* src, size_t src_len, char* dst)
|
||||
{
|
||||
if (src == NULL || src_len == 0 || dst == NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
const char32_t *cur_utf32 = src;
|
||||
const char32_t *end_utf32 = src + src_len;
|
||||
char *cur = dst;
|
||||
while (cur_utf32 < end_utf32) {
|
||||
size_t len = utf32_codepoint_utf8_length(*cur_utf32);
|
||||
utf32_codepoint_to_utf8((uint8_t *)cur, *cur_utf32++, len);
|
||||
cur += len;
|
||||
}
|
||||
*cur = '\0';
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// UTF-16
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
// strcmp() for zero-terminated char16_t strings: returns the difference of
// the first pair of units that differ, or 0 when both strings are equal.
int strcmp16(const char16_t *s1, const char16_t *s2)
{
    for (;; s1++, s2++) {
        const int diff = (int)*s1 - (int)*s2;
        if (diff != 0 || *s1 == 0) {
            return diff;
        }
    }
}
|
||||
|
||||
// strncmp() for char16_t strings: compares at most n units and stops early at
// the first difference or at a zero unit in s1.
int strncmp16(const char16_t *s1, const char16_t *s2, size_t n)
{
    for (size_t i = 0; i < n; i++) {
        const int diff = (int)s1[i] - (int)s2[i];
        if (diff != 0 || s1[i] == 0) {
            return diff;
        }
    }
    return 0;
}
|
||||
|
||||
// strcpy() for char16_t strings: copies src, including its zero terminator,
// into dst and returns dst.
char16_t *strcpy16(char16_t *dst, const char16_t *src)
{
    size_t i = 0;
    while ((dst[i] = src[i]) != 0) {
        i++;
    }
    return dst;
}
|
||||
|
||||
// Number of char16_t units before the terminating zero unit.
size_t strlen16(const char16_t *s)
{
    size_t count = 0;
    while (s[count] != 0) {
        count++;
    }
    return count;
}
|
||||
|
||||
|
||||
// Copy at most n units from src to dst, stopping after a zero unit, and then
// write one more zero unit. Unlike strncpy() the result is always terminated
// and is not zero-padded, so dst must have room for up to n + 1 units.
// Returns dst.
char16_t *strncpy16(char16_t *dst, const char16_t *src, size_t n)
{
    char16_t *q = dst;
    const char16_t *p = src;
    // Must be char16_t: a plain char here truncated every copied unit to 8
    // bits and ended the copy early at any unit whose low byte is zero.
    char16_t ch;

    while (n) {
        n--;
        *q++ = ch = *p++;
        if ( !ch )
            break;
    }

    *q = 0;

    return dst;
}
|
||||
|
||||
// Like strlen16, but never examines more than maxlen units.
size_t strnlen16(const char16_t *s, size_t maxlen)
{
    size_t count = 0;
    /* Important: the bound must be tested before s[count] is read, since
       the unit beyond the maximum may not be readable. */
    while (count < maxlen && s[count] != 0) {
        count++;
    }
    return count;
}
|
||||
|
||||
// Compare two char16_t strings given by explicit lengths. Embedded zero units
// are compared like any other unit, and strings of different length are
// never equal.
//
// Returns the difference of the first differing pair of units. When one
// string is a prefix of the other, the shorter one orders first: the result
// is derived from the first extra unit of the longer string, or is -1 / 1
// when that unit is zero (the previous code returned 0 in that case and so
// reported the strings as equal).
int strzcmp16(const char16_t *s1, size_t n1, const char16_t *s2, size_t n2)
{
    const size_t common = (n1 < n2) ? n1 : n2;

    for (size_t i = 0; i < common; i++) {
        const int d = (int)s1[i] - (int)s2[i];
        if (d) {
            return d;
        }
    }

    if (n1 < n2) {
        const int d = 0 - (int)s2[common];
        return d ? d : -1;
    }
    if (n1 > n2) {
        const int d = (int)s1[common] - 0;
        return d ? d : 1;
    }
    return 0;
}
|
||||
|
||||
int strzcmp16_h_n(const char16_t *s1H, size_t n1, const char16_t *s2N, size_t n2)
|
||||
{
|
||||
const char16_t* e1 = s1H+n1;
|
||||
const char16_t* e2 = s2N+n2;
|
||||
|
||||
while (s1H < e1 && s2N < e2) {
|
||||
const char16_t c2 = ntohs(*s2N);
|
||||
const int d = (int)*s1H++ - (int)c2;
|
||||
s2N++;
|
||||
if (d) {
|
||||
return d;
|
||||
}
|
||||
}
|
||||
|
||||
return n1 < n2
|
||||
? (0 - (int)ntohs(*s2N))
|
||||
: (n1 > n2
|
||||
? ((int)*s1H - 0)
|
||||
: 0);
|
||||
}
|
||||
|
||||
void utf16_to_utf8(const char16_t* src, size_t src_len, char* dst)
|
||||
{
|
||||
if (src == NULL || src_len == 0 || dst == NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
const char16_t* cur_utf16 = src;
|
||||
const char16_t* const end_utf16 = src + src_len;
|
||||
char *cur = dst;
|
||||
while (cur_utf16 < end_utf16) {
|
||||
char32_t utf32;
|
||||
// surrogate pairs
|
||||
if ((*cur_utf16 & 0xFC00) == 0xD800) {
|
||||
utf32 = (*cur_utf16++ - 0xD800) << 10;
|
||||
utf32 |= *cur_utf16++ - 0xDC00;
|
||||
utf32 += 0x10000;
|
||||
} else {
|
||||
utf32 = (char32_t) *cur_utf16++;
|
||||
}
|
||||
const size_t len = utf32_codepoint_utf8_length(utf32);
|
||||
utf32_codepoint_to_utf8((uint8_t*)cur, utf32, len);
|
||||
cur += len;
|
||||
}
|
||||
*cur = '\0';
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// UTF-8
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
ssize_t utf8_length(const char *src)
|
||||
{
|
||||
const char *cur = src;
|
||||
size_t ret = 0;
|
||||
while (*cur != '\0') {
|
||||
const char first_char = *cur++;
|
||||
if ((first_char & 0x80) == 0) { // ASCII
|
||||
ret += 1;
|
||||
continue;
|
||||
}
|
||||
// (UTF-8's character must not be like 10xxxxxx,
|
||||
// but 110xxxxx, 1110xxxx, ... or 1111110x)
|
||||
if ((first_char & 0x40) == 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
int32_t mask, to_ignore_mask;
|
||||
size_t num_to_read = 0;
|
||||
char32_t utf32 = 0;
|
||||
for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0x80;
|
||||
num_to_read < 5 && (first_char & mask);
|
||||
num_to_read++, to_ignore_mask |= mask, mask >>= 1) {
|
||||
if ((*cur & 0xC0) != 0x80) { // must be 10xxxxxx
|
||||
return -1;
|
||||
}
|
||||
// 0x3F == 00111111
|
||||
utf32 = (utf32 << 6) + (*cur++ & 0x3F);
|
||||
}
|
||||
// "first_char" must be (110xxxxx - 11110xxx)
|
||||
if (num_to_read == 5) {
|
||||
return -1;
|
||||
}
|
||||
to_ignore_mask |= mask;
|
||||
utf32 |= ((~to_ignore_mask) & first_char) << (6 * (num_to_read - 1));
|
||||
if (utf32 > kUnicodeMaxCodepoint) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
ret += num_to_read;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
ssize_t utf16_to_utf8_length(const char16_t *src, size_t src_len)
|
||||
{
|
||||
if (src == NULL || src_len == 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
size_t ret = 0;
|
||||
const char16_t* const end = src + src_len;
|
||||
while (src < end) {
|
||||
if ((*src & 0xFC00) == 0xD800 && (src + 1) < end
|
||||
&& (*++src & 0xFC00) == 0xDC00) {
|
||||
// surrogate pairs are always 4 bytes.
|
||||
ret += 4;
|
||||
src++;
|
||||
} else {
|
||||
ret += utf32_codepoint_utf8_length((char32_t) *src++);
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
 * Length in bytes (1-4) of a UTF-8 sequence, chosen from the top four bits
 * of its first byte:
 *
 * 1111 -> 4
 * 1110 -> 3
 * 110x -> 2
 * 10xx -> 1
 * 0xxx -> 1
 */
static inline size_t utf8_codepoint_len(uint8_t ch)
{
    switch (ch >> 4) {
        case 0xF:
            return 4;
        case 0xE:
            return 3;
        case 0xD:
        case 0xC:
            return 2;
        default:
            return 1;
    }
}
|
||||
|
||||
// Append the low six bits of a continuation byte to *codePoint.
static inline void utf8_shift_and_mask(uint32_t* codePoint, const uint8_t byte)
{
    *codePoint = (*codePoint << 6) | (0x3F & byte);
}
|
||||
|
||||
// Count the code points in the first src_len bytes of src. Only lead bytes
// are inspected; continuation bytes are skipped without validation.
// Returns 0 when src is NULL or src_len is 0.
size_t utf8_to_utf32_length(const char *src, size_t src_len)
{
    if (src == NULL || src_len == 0) {
        return 0;
    }

    const char* const end = src + src_len;
    const char* cur = src;
    size_t count = 0;
    while (cur < end) {
        const char lead = *cur;
        size_t step = 1;
        if ((lead & 0x80) != 0) {
            // One extra byte per leading 1 bit after the first.
            for (int32_t bit = 0x40; (lead & bit); bit >>= 1) {
                step++;
            }
        }
        cur += step;
        count++;
    }
    return count;
}
|
||||
|
||||
void utf8_to_utf32(const char* src, size_t src_len, char32_t* dst)
|
||||
{
|
||||
if (src == NULL || src_len == 0 || dst == NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
const char* cur = src;
|
||||
const char* const end = src + src_len;
|
||||
char32_t* cur_utf32 = dst;
|
||||
while (cur < end) {
|
||||
size_t num_read;
|
||||
*cur_utf32++ = static_cast<char32_t>(utf32_at_internal(cur, &num_read));
|
||||
cur += num_read;
|
||||
}
|
||||
*cur_utf32 = 0;
|
||||
}
|
||||
|
||||
// Decode one UTF-8 sequence of <length> bytes (1-4) starting at src.
// Any other length yields 0xffff. Continuation bytes are not validated; only
// their low six bits are used.
static inline uint32_t utf8_to_utf32_codepoint(const uint8_t *src, size_t length)
{
    if (length < 1 || length > 4) {
        return 0xffff;
    }
    if (length == 1) {
        return src[0];
    }

    // Payload mask of the lead byte for 2-, 3- and 4-byte sequences.
    const uint32_t leadMask = (length == 2) ? 0x1f : ((length == 3) ? 0x0f : 0x07);
    uint32_t unicode = src[0] & leadMask;
    for (size_t i = 1; i < length; i++) {
        unicode <<= 6;
        unicode |= 0x3F & src[i];
    }
    return unicode;
}
|
||||
|
||||
ssize_t utf8_to_utf16_length(const uint8_t* u8str, size_t u8len)
|
||||
{
|
||||
const uint8_t* const u8end = u8str + u8len;
|
||||
const uint8_t* u8cur = u8str;
|
||||
|
||||
/* Validate that the UTF-8 is the correct len */
|
||||
size_t u16measuredLen = 0;
|
||||
while (u8cur < u8end) {
|
||||
u16measuredLen++;
|
||||
int u8charLen = utf8_codepoint_len(*u8cur);
|
||||
uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8charLen);
|
||||
if (codepoint > 0xFFFF) u16measuredLen++; // this will be a surrogate pair in utf16
|
||||
u8cur += u8charLen;
|
||||
}
|
||||
|
||||
/**
|
||||
* Make sure that we ended where we thought we would and the output UTF-16
|
||||
* will be exactly how long we were told it would be.
|
||||
*/
|
||||
if (u8cur != u8end) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
return u16measuredLen;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a UTF-8 string to UTF-16. The destination UTF-16 buffer must have
|
||||
* space for NULL at the end.
|
||||
*/
|
||||
void utf8_to_utf16(const uint8_t* u8str, size_t u8len, char16_t* u16str)
|
||||
{
|
||||
const uint8_t* const u8end = u8str + u8len;
|
||||
const uint8_t* u8cur = u8str;
|
||||
char16_t* u16cur = u16str;
|
||||
|
||||
while (u8cur < u8end) {
|
||||
size_t u8len = utf8_codepoint_len(*u8cur);
|
||||
uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8len);
|
||||
|
||||
// Convert the UTF32 codepoint to one or more UTF16 codepoints
|
||||
if (codepoint <= 0xFFFF) {
|
||||
// Single UTF16 character
|
||||
*u16cur++ = (char16_t) codepoint;
|
||||
} else {
|
||||
// Multiple UTF16 characters with surrogates
|
||||
codepoint = codepoint - 0x10000;
|
||||
*u16cur++ = (char16_t) ((codepoint >> 10) + 0xD800);
|
||||
*u16cur++ = (char16_t) ((codepoint & 0x3FF) + 0xDC00);
|
||||
}
|
||||
|
||||
u8cur += u8len;
|
||||
}
|
||||
*u16cur = 0;
|
||||
}
|
||||
|
||||
}
|
|
@ -8,7 +8,8 @@ ifneq ($(TARGET_SIMULATOR),true)
|
|||
test_src_files := \
|
||||
ObbFile_test.cpp \
|
||||
Looper_test.cpp \
|
||||
String8_test.cpp
|
||||
String8_test.cpp \
|
||||
Unicode_test.cpp
|
||||
|
||||
shared_libraries := \
|
||||
libz \
|
||||
|
|
|
@ -0,0 +1,115 @@
|
|||
/*
|
||||
* Copyright (C) 2010 The Android Open Source Project
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#define LOG_TAG "Unicode_test"
|
||||
#include <utils/Log.h>
|
||||
#include <utils/Unicode.h>
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
namespace android {
|
||||
|
||||
class UnicodeTest : public testing::Test {
|
||||
protected:
|
||||
virtual void SetUp() {
|
||||
}
|
||||
|
||||
virtual void TearDown() {
|
||||
}
|
||||
};
|
||||
|
||||
// An empty input must measure as zero UTF-16 units.
TEST_F(UnicodeTest, UTF8toUTF16ZeroLength) {
    ssize_t measured;

    // Standard C++ has no zero-length arrays, so use a one-byte buffer and
    // pass an explicit length of 0; the byte itself is never read.
    const uint8_t str[] = { 0 };

    measured = utf8_to_utf16_length(str, 0);
    EXPECT_EQ(0, measured)
            << "Zero length input should return zero length output.";
}
|
||||
|
||||
// A single ASCII byte measures as one UTF-16 unit.
TEST_F(UnicodeTest, UTF8toUTF16ASCIILength) {
    // U+0030 or ASCII '0'
    const uint8_t str[] = { 0x30 };

    const ssize_t measured = utf8_to_utf16_length(str, sizeof(str));

    EXPECT_EQ(1, measured)
            << "ASCII glyphs should have a length of 1 char16_t";
}
|
||||
|
||||
// A three-byte UTF-8 sequence measures as one UTF-16 unit.
// U+2323 is in the Basic Multilingual Plane (plane 0), so it needs no
// surrogate pair; the failure message is worded accordingly.
TEST_F(UnicodeTest, UTF8toUTF16Plane1Length) {
    ssize_t measured;

    // U+2323 SMILE
    const uint8_t str[] = { 0xE2, 0x8C, 0xA3 };

    measured = utf8_to_utf16_length(str, sizeof(str));
    EXPECT_EQ(1, measured)
            << "BMP (plane 0) glyphs should have a length of 1 char16_t";
}
|
||||
|
||||
// A four-byte UTF-8 sequence measures as two UTF-16 units.
TEST_F(UnicodeTest, UTF8toUTF16SurrogateLength) {
    // U+10000
    const uint8_t str[] = { 0xF0, 0x90, 0x80, 0x80 };

    const ssize_t measured = utf8_to_utf16_length(str, sizeof(str));

    EXPECT_EQ(2, measured)
            << "Surrogate pairs should have a length of 2 char16_t";
}
|
||||
|
||||
// Input that ends in the middle of a multi-byte sequence is reported as -1.
TEST_F(UnicodeTest, UTF8toUTF16TruncatedUTF8) {
    // U+2323 SMILE (0xE2 0x8C 0xA3) with its last byte missing
    const uint8_t str[] = { 0xE2, 0x8C };

    const ssize_t measured = utf8_to_utf16_length(str, sizeof(str));

    EXPECT_EQ(-1, measured)
            << "Truncated UTF-8 should return -1 to indicate invalid";
}
|
||||
|
||||
// Converts one sequence of each UTF-8 length (1-4 bytes) and checks every
// resulting UTF-16 unit, including the terminator.
TEST_F(UnicodeTest, UTF8toUTF16Normal) {
    const uint8_t str[] = {
        0x30, // U+0030, 1 UTF-16 character
        0xC4, 0x80, // U+0100, 1 UTF-16 character
        0xE2, 0x8C, 0xA3, // U+2323, 1 UTF-16 character
        0xF0, 0x90, 0x80, 0x80, // U+10000, 2 UTF-16 character
    };

    char16_t output[1 + 1 + 1 + 2 + 1]; // Room for NULL

    utf8_to_utf16(str, sizeof(str), output);

    EXPECT_EQ(0x0030, output[0])
            << "should be U+0030";
    EXPECT_EQ(0x0100, output[1])
            << "should be U+0100";
    EXPECT_EQ(0x2323, output[2])
            << "should be U+2323";
    EXPECT_EQ(0xD800, output[3])
            << "should be first half of surrogate U+10000";
    EXPECT_EQ(0xDC00, output[4])
            << "should be second half of surrogate U+10000";
    // The terminator is a char16_t value, not a pointer, so compare with 0
    // rather than NULL.
    EXPECT_EQ(0, output[5])
            << "should be NULL terminated";
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue