92f5984d2c
Allows the use of UTF-8 for packing resources instead of the default of UTF-16 for Java. When strings are extracted from the ResStringPool, they are converted to UTF-16 and the result is cached for subsequent calls. When using aapt to package, add in the "-8" switch to pack the resources using UTF-8. This will result in the value, key, and type strings as well as the compiled XML string values taking significantly less space in the final application package in most scenarios. Change-Id: I129483f8b3d3b1c5869dced05cb525e494a6c83a
914 lines
22 KiB
C++
914 lines
22 KiB
C++
/*
|
|
* Copyright (C) 2005 The Android Open Source Project
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#include <utils/String8.h>
|
|
|
|
#include <utils/Log.h>
|
|
#include <utils/String16.h>
|
|
#include <utils/TextOutput.h>
|
|
#include <utils/threads.h>
|
|
|
|
#include <private/utils/Static.h>
|
|
|
|
#include <ctype.h>
|
|
|
|
/*
|
|
 * Functions outside the android namespace are defined below it, since they
 * use functions and constants from the android namespace.
|
|
*/
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
namespace android {
|
|
|
|
// Continuation bytes of a UTF-8 sequence are produced as
// (value | kByteMark) & kByteMask, i.e. they always look like 10xxxxxx.
static const char32_t kByteMask = 0xBF;
static const char32_t kByteMark = 0x80;

// Surrogate code units are not valid UTF-32 characters; these bounds let the
// conversion helpers screen them out.
static const char32_t kUnicodeSurrogateHighStart = 0xD800;
static const char32_t kUnicodeSurrogateHighEnd   = 0xDBFF;
static const char32_t kUnicodeSurrogateLowStart  = 0xDC00;
static const char32_t kUnicodeSurrogateLowEnd    = 0xDFFF;
static const char32_t kUnicodeSurrogateStart     = kUnicodeSurrogateHighStart;
static const char32_t kUnicodeSurrogateEnd       = kUnicodeSurrogateLowEnd;

// Largest code point defined by Unicode.
static const char32_t kUnicodeMaxCodepoint = 0x10FFFF;
|
|
|
|
// Marker bits OR-ed into the first byte of a UTF-8 sequence, indexed by the
// total number of bytes in that sequence (index 0 is unused):
//   1 byte : 0xxxxxxx                              (00-7f)            7 bits, mark 0x00
//   2 bytes: 110yyyyx 10xxxxxx                     (c0-df)(80-bf)    11 bits, mark 0xC0
//   3 bytes: 1110yyyy 10yxxxxx 10xxxxxx            (e0-ef)(80-bf)x2  16 bits, mark 0xE0
//   4 bytes: 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx   (f0-f7)(80-bf)x3  21 bits, mark 0xF0
static const char32_t kFirstByteMark[] = {
    0x00, 0x00, 0xC0, 0xE0, 0xF0
};
|
|
|
|
// Separator used inside resource paths. Unlike OS_PATH_SEPARATOR, it does not
// depend on the platform.
#define RES_PATH_SEPARATOR '/'
|
|
|
|
// Return number of utf8 bytes required for the character.
|
|
static size_t utf32_to_utf8_bytes(char32_t srcChar)
|
|
{
|
|
size_t bytesToWrite;
|
|
|
|
// Figure out how many bytes the result will require.
|
|
if (srcChar < 0x00000080)
|
|
{
|
|
bytesToWrite = 1;
|
|
}
|
|
else if (srcChar < 0x00000800)
|
|
{
|
|
bytesToWrite = 2;
|
|
}
|
|
else if (srcChar < 0x00010000)
|
|
{
|
|
if ((srcChar < kUnicodeSurrogateStart)
|
|
|| (srcChar > kUnicodeSurrogateEnd))
|
|
{
|
|
bytesToWrite = 3;
|
|
}
|
|
else
|
|
{
|
|
// Surrogates are invalid UTF-32 characters.
|
|
return 0;
|
|
}
|
|
}
|
|
// Max code point for Unicode is 0x0010FFFF.
|
|
else if (srcChar <= kUnicodeMaxCodepoint)
|
|
{
|
|
bytesToWrite = 4;
|
|
}
|
|
else
|
|
{
|
|
// Invalid UTF-32 character.
|
|
return 0;
|
|
}
|
|
|
|
return bytesToWrite;
|
|
}
|
|
|
|
// Write out the source character to <dstP>.
|
|
|
|
static void utf32_to_utf8(uint8_t* dstP, char32_t srcChar, size_t bytes)
|
|
{
|
|
dstP += bytes;
|
|
switch (bytes)
|
|
{ /* note: everything falls through. */
|
|
case 4: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6;
|
|
case 3: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6;
|
|
case 2: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6;
|
|
case 1: *--dstP = (uint8_t)(srcChar | kFirstByteMark[bytes]);
|
|
}
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// Shared buffer (and its character data) backing every empty String8.
// Both are set by initialize_string8() and cleared by terminate_string8().
static SharedBuffer* gEmptyStringBuf = NULL;
static char* gEmptyString = NULL;

// NOTE(review): these two symbols are only touched under LIBUTILS_NATIVE in
// initialize_string8(); presumably a Darwin linker workaround -- confirm.
extern int gDarwinCantLoadAllObjects;
int gDarwinIsReallyAnnoying;
|
|
|
|
static inline char* getEmptyString()
|
|
{
|
|
gEmptyStringBuf->acquire();
|
|
return gEmptyString;
|
|
}
|
|
|
|
void initialize_string8()
|
|
{
|
|
#ifdef LIBUTILS_NATIVE
|
|
// Bite me, Darwin!
|
|
gDarwinIsReallyAnnoying = gDarwinCantLoadAllObjects;
|
|
#endif
|
|
|
|
SharedBuffer* buf = SharedBuffer::alloc(1);
|
|
char* str = (char*)buf->data();
|
|
*str = 0;
|
|
gEmptyStringBuf = buf;
|
|
gEmptyString = str;
|
|
}
|
|
|
|
void terminate_string8()
|
|
{
|
|
SharedBuffer::bufferFromData(gEmptyString)->release();
|
|
gEmptyStringBuf = NULL;
|
|
gEmptyString = NULL;
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
static char* allocFromUTF8(const char* in, size_t len)
|
|
{
|
|
if (len > 0) {
|
|
SharedBuffer* buf = SharedBuffer::alloc(len+1);
|
|
LOG_ASSERT(buf, "Unable to allocate shared buffer");
|
|
if (buf) {
|
|
char* str = (char*)buf->data();
|
|
memcpy(str, in, len);
|
|
str[len] = 0;
|
|
return str;
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
return getEmptyString();
|
|
}
|
|
|
|
template<typename T, typename L>
|
|
static char* allocFromUTF16OrUTF32(const T* in, L len)
|
|
{
|
|
if (len == 0) return getEmptyString();
|
|
|
|
size_t bytes = 0;
|
|
const T* end = in+len;
|
|
const T* p = in;
|
|
|
|
while (p < end) {
|
|
bytes += utf32_to_utf8_bytes(*p);
|
|
p++;
|
|
}
|
|
|
|
SharedBuffer* buf = SharedBuffer::alloc(bytes+1);
|
|
LOG_ASSERT(buf, "Unable to allocate shared buffer");
|
|
if (buf) {
|
|
p = in;
|
|
char* str = (char*)buf->data();
|
|
char* d = str;
|
|
while (p < end) {
|
|
const T c = *p++;
|
|
size_t len = utf32_to_utf8_bytes(c);
|
|
utf32_to_utf8((uint8_t*)d, c, len);
|
|
d += len;
|
|
}
|
|
*d = 0;
|
|
|
|
return str;
|
|
}
|
|
|
|
return getEmptyString();
|
|
}
|
|
|
|
static char* allocFromUTF16(const char16_t* in, size_t len)
|
|
{
|
|
if (len == 0) return getEmptyString();
|
|
|
|
const size_t bytes = utf8_length_from_utf16(in, len);
|
|
|
|
SharedBuffer* buf = SharedBuffer::alloc(bytes+1);
|
|
LOG_ASSERT(buf, "Unable to allocate shared buffer");
|
|
if (buf) {
|
|
char* str = (char*)buf->data();
|
|
|
|
utf16_to_utf8(in, len, str, bytes+1);
|
|
|
|
return str;
|
|
}
|
|
|
|
return getEmptyString();
|
|
}
|
|
|
|
static char* allocFromUTF32(const char32_t* in, size_t len)
|
|
{
|
|
return allocFromUTF16OrUTF32<char32_t, size_t>(in, len);
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// Default constructor: starts out sharing the global empty string.
String8::String8()
    : mString(getEmptyString())
{
}
|
|
|
|
// Copy constructor: shares |o|'s character buffer and acquire()s it instead
// of copying the bytes.
String8::String8(const String8& o)
    : mString(o.mString)
{
    SharedBuffer* const shared = SharedBuffer::bufferFromData(mString);
    shared->acquire();
}
|
|
|
|
// Builds a string from the NUL-terminated bytes at |o|. If the allocation
// fails the string falls back to the shared empty string.
String8::String8(const char* o)
    : mString(allocFromUTF8(o, strlen(o)))
{
    if (!mString) {
        mString = getEmptyString();
    }
}
|
|
|
|
// Builds a string from the first |len| bytes at |o|. If the allocation fails
// the string falls back to the shared empty string.
String8::String8(const char* o, size_t len)
    : mString(allocFromUTF8(o, len))
{
    if (!mString) {
        mString = getEmptyString();
    }
}
|
|
|
|
// Builds a UTF-8 string from the UTF-16 contents of |o|.
String8::String8(const String16& o)
    : mString(allocFromUTF16(o.string(), o.size()))
{
}
|
|
|
|
// Builds a UTF-8 string from the zero-terminated UTF-16 data at |o|; the
// length is measured with strlen16().
String8::String8(const char16_t* o)
    : mString(allocFromUTF16(o, strlen16(o)))
{
}
|
|
|
|
// Builds a UTF-8 string from |len| UTF-16 code units starting at |o|.
String8::String8(const char16_t* o, size_t len)
    : mString(allocFromUTF16(o, len))
{
}
|
|
|
|
// Builds a UTF-8 string from the zero-terminated UTF-32 data at |o|; the
// length is measured with strlen32().
String8::String8(const char32_t* o)
    : mString(allocFromUTF32(o, strlen32(o)))
{
}
|
|
|
|
// Builds a UTF-8 string from |len| UTF-32 characters starting at |o|.
String8::String8(const char32_t* o, size_t len)
    : mString(allocFromUTF32(o, len))
{
}
|
|
|
|
// Destructor: release()s this string's hold on its character buffer.
String8::~String8()
{
    SharedBuffer* const held = SharedBuffer::bufferFromData(mString);
    held->release();
}
|
|
|
|
void String8::setTo(const String8& other)
|
|
{
|
|
SharedBuffer::bufferFromData(other.mString)->acquire();
|
|
SharedBuffer::bufferFromData(mString)->release();
|
|
mString = other.mString;
|
|
}
|
|
|
|
// Replaces the contents with a copy of the NUL-terminated string |other|.
// Returns NO_ERROR on success. On allocation failure the string becomes
// empty and NO_MEMORY is returned.
status_t String8::setTo(const char* other)
{
    // Build the new buffer before dropping the old one: |other| may point
    // into this string's own storage, which must stay alive while it is read.
    char* const replacement = allocFromUTF8(other, strlen(other));
    SharedBuffer::bufferFromData(mString)->release();
    mString = replacement;
    if (mString) return NO_ERROR;

    mString = getEmptyString();
    return NO_MEMORY;
}
|
|
|
|
// Replaces the contents with a copy of the first |len| bytes of |other|.
// Returns NO_ERROR on success. On allocation failure the string becomes
// empty and NO_MEMORY is returned.
status_t String8::setTo(const char* other, size_t len)
{
    // Build the new buffer before dropping the old one: |other| may point
    // into this string's own storage, which must stay alive while it is read.
    char* const replacement = allocFromUTF8(other, len);
    SharedBuffer::bufferFromData(mString)->release();
    mString = replacement;
    if (mString) return NO_ERROR;

    mString = getEmptyString();
    return NO_MEMORY;
}
|
|
|
|
// Replaces the contents with the UTF-8 conversion of |len| UTF-16 code units
// from |other|.
// Returns NO_ERROR when a buffer was obtained; otherwise the string becomes
// empty and NO_MEMORY is returned.
status_t String8::setTo(const char16_t* other, size_t len)
{
    // Convert first and release afterwards, so the old buffer is still valid
    // while the source data is being read.
    char* const replacement = allocFromUTF16(other, len);
    SharedBuffer::bufferFromData(mString)->release();
    mString = replacement;
    if (mString) return NO_ERROR;

    mString = getEmptyString();
    return NO_MEMORY;
}
|
|
|
|
// Replaces the contents with the UTF-8 conversion of |len| UTF-32 characters
// from |other|.
// Returns NO_ERROR when a buffer was obtained; otherwise the string becomes
// empty and NO_MEMORY is returned.
status_t String8::setTo(const char32_t* other, size_t len)
{
    // Convert first and release afterwards, so the old buffer is still valid
    // while the source data is being read.
    char* const replacement = allocFromUTF32(other, len);
    SharedBuffer::bufferFromData(mString)->release();
    mString = replacement;
    if (mString) return NO_ERROR;

    mString = getEmptyString();
    return NO_MEMORY;
}
|
|
|
|
// Appends |other| to this string.
// When this string is empty it simply shares |other|'s buffer; when |other|
// is empty nothing changes. Otherwise returns the result of real_append().
status_t String8::append(const String8& other)
{
    const size_t extra = other.bytes();

    if (bytes() == 0) {
        setTo(other);
        return NO_ERROR;
    }
    if (extra == 0) {
        return NO_ERROR;
    }

    return real_append(other.string(), extra);
}
|
|
|
|
// Appends the NUL-terminated string |other|; forwards to the length-taking
// overload.
status_t String8::append(const char* other)
{
    const size_t otherLen = strlen(other);
    return append(other, otherLen);
}
|
|
|
|
// Appends the first |otherLen| bytes of |other|.
// An empty destination is handled with setTo(); an empty source is a no-op.
status_t String8::append(const char* other, size_t otherLen)
{
    if (bytes() == 0) {
        return setTo(other, otherLen);
    }
    if (otherLen == 0) {
        return NO_ERROR;
    }
    return real_append(other, otherLen);
}
|
|
|
|
// Grows the buffer with editResize() and copies |otherLen| bytes of |other|
// after the current contents, followed by a terminator.
// Returns NO_MEMORY (leaving mString unassigned) if the resize fails.
status_t String8::real_append(const char* other, size_t otherLen)
{
    const size_t oldLen = bytes();

    SharedBuffer* const resized = SharedBuffer::bufferFromData(mString)
        ->editResize(oldLen+otherLen+1);
    if (!resized) {
        return NO_MEMORY;
    }

    char* const base = (char*)resized->data();
    mString = base;

    char* const tail = base + oldLen;
    memcpy(tail, other, otherLen);
    tail[otherLen] = '\0';
    return NO_ERROR;
}
|
|
|
|
char* String8::lockBuffer(size_t size)
|
|
{
|
|
SharedBuffer* buf = SharedBuffer::bufferFromData(mString)
|
|
->editResize(size+1);
|
|
if (buf) {
|
|
char* str = (char*)buf->data();
|
|
mString = str;
|
|
return str;
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
void String8::unlockBuffer()
|
|
{
|
|
unlockBuffer(strlen(mString));
|
|
}
|
|
|
|
// Ends a lockBuffer() session, setting the string length to |size|.
// If |size| differs from the current size the buffer is resized and
// terminated at |size|.
// Returns NO_ERROR on success (including when no resize was needed) and
// NO_MEMORY only when the resize fails.
status_t String8::unlockBuffer(size_t size)
{
    if (size != this->size()) {
        SharedBuffer* buf = SharedBuffer::bufferFromData(mString)
            ->editResize(size+1);
        if (!buf) {
            return NO_MEMORY;
        }

        char* str = (char*)buf->data();
        str[size] = 0;
        mString = str;
    }

    // Nothing to resize is not an error; previously this path fell through
    // to the NO_MEMORY return.
    return NO_ERROR;
}
|
|
|
|
ssize_t String8::find(const char* other, size_t start) const
|
|
{
|
|
size_t len = size();
|
|
if (start >= len) {
|
|
return -1;
|
|
}
|
|
const char* s = mString+start;
|
|
const char* p = strstr(s, other);
|
|
return p ? p-mString : -1;
|
|
}
|
|
|
|
void String8::toLower()
|
|
{
|
|
toLower(0, size());
|
|
}
|
|
|
|
void String8::toLower(size_t start, size_t length)
|
|
{
|
|
const size_t len = size();
|
|
if (start >= len) {
|
|
return;
|
|
}
|
|
if (start+length > len) {
|
|
length = len-start;
|
|
}
|
|
char* buf = lockBuffer(len);
|
|
buf += start;
|
|
while (length > 0) {
|
|
*buf = tolower(*buf);
|
|
buf++;
|
|
length--;
|
|
}
|
|
unlockBuffer(len);
|
|
}
|
|
|
|
void String8::toUpper()
|
|
{
|
|
toUpper(0, size());
|
|
}
|
|
|
|
void String8::toUpper(size_t start, size_t length)
|
|
{
|
|
const size_t len = size();
|
|
if (start >= len) {
|
|
return;
|
|
}
|
|
if (start+length > len) {
|
|
length = len-start;
|
|
}
|
|
char* buf = lockBuffer(len);
|
|
buf += start;
|
|
while (length > 0) {
|
|
*buf = toupper(*buf);
|
|
buf++;
|
|
length--;
|
|
}
|
|
unlockBuffer(len);
|
|
}
|
|
|
|
size_t String8::getUtf32Length() const
|
|
{
|
|
return utf32_length(mString, length());
|
|
}
|
|
|
|
int32_t String8::getUtf32At(size_t index, size_t *next_index) const
|
|
{
|
|
return utf32_at(mString, length(), index, next_index);
|
|
}
|
|
|
|
size_t String8::getUtf32(char32_t* dst, size_t dst_len) const
|
|
{
|
|
return utf8_to_utf32(mString, length(), dst, dst_len);
|
|
}
|
|
|
|
// Streams the string's character data into |to| and returns |to| so that
// insertions can be chained.
TextOutput& operator<<(TextOutput& to, const String8& val)
{
    const char* const text = val.string();
    to << text;
    return to;
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Path functions
|
|
|
|
void String8::setPathName(const char* name)
|
|
{
|
|
setPathName(name, strlen(name));
|
|
}
|
|
|
|
void String8::setPathName(const char* name, size_t len)
|
|
{
|
|
char* buf = lockBuffer(len);
|
|
|
|
memcpy(buf, name, len);
|
|
|
|
// remove trailing path separator, if present
|
|
if (len > 0 && buf[len-1] == OS_PATH_SEPARATOR)
|
|
len--;
|
|
|
|
buf[len] = '\0';
|
|
|
|
unlockBuffer(len);
|
|
}
|
|
|
|
// Returns everything after the last OS_PATH_SEPARATOR, or a copy of the
// whole string when it contains no separator.
String8 String8::getPathLeaf(void) const
{
    const char* const lastSep = strrchr(mString, OS_PATH_SEPARATOR);
    if (lastSep != NULL) {
        return String8(lastSep+1);
    }
    return String8(*this);
}
|
|
|
|
// Returns everything before the last OS_PATH_SEPARATOR, or an empty string
// when there is no separator.
String8 String8::getPathDir(void) const
{
    const char* const path = mString;
    const char* const lastSep = strrchr(path, OS_PATH_SEPARATOR);
    if (lastSep != NULL) {
        return String8(path, lastSep - path);
    }
    return String8("");
}
|
|
|
|
// Returns the first path component, skipping one leading separator.
// When |outRemains| is non-NULL it receives whatever follows the separator
// that ends that component, or an empty string if there was none.
String8 String8::walkPath(String8* outRemains) const
{
    const char* const whole = mString;
    const char* first = whole;

    const char* sep = strchr(first, OS_PATH_SEPARATOR);
    if (sep == first) {
        // don't include a leading '/'.
        first++;
        sep = strchr(first, OS_PATH_SEPARATOR);
    }

    if (sep != NULL) {
        String8 component(first, sep-first);
        if (outRemains) *outRemains = String8(sep+1);
        return component;
    }

    // No further separator: the rest of the string is the only component.
    String8 component = (first != whole) ? String8(first) : *this;
    if (outRemains) *outRemains = String8("");
    return component;
}
|
|
|
|
/*
|
|
* Helper function for finding the start of an extension in a pathname.
|
|
*
|
|
* Returns a pointer inside mString, or NULL if no extension was found.
|
|
*/
|
|
char* String8::find_extension(void) const
|
|
{
|
|
const char* lastSlash;
|
|
const char* lastDot;
|
|
int extLen;
|
|
const char* const str = mString;
|
|
|
|
// only look at the filename
|
|
lastSlash = strrchr(str, OS_PATH_SEPARATOR);
|
|
if (lastSlash == NULL)
|
|
lastSlash = str;
|
|
else
|
|
lastSlash++;
|
|
|
|
// find the last dot
|
|
lastDot = strrchr(lastSlash, '.');
|
|
if (lastDot == NULL)
|
|
return NULL;
|
|
|
|
// looks good, ship it
|
|
return const_cast<char*>(lastDot);
|
|
}
|
|
|
|
// Returns the filename extension, including its leading '.', or an empty
// string when find_extension() finds none.
String8 String8::getPathExtension(void) const
{
    const char* const dot = find_extension();
    if (dot == NULL) {
        return String8("");
    }
    return String8(dot);
}
|
|
|
|
// Returns the path with its filename extension removed, or a copy of the
// whole string when find_extension() finds none.
String8 String8::getBasePath(void) const
{
    const char* const path = mString;
    const char* const dot = find_extension();
    if (dot != NULL) {
        return String8(path, dot - path);
    }
    return String8(*this);
}
|
|
|
|
// Appends the path |name| to this path, inserting OS_PATH_SEPARATOR between
// them when the current path does not already end with one.
// If |name| starts with OS_PATH_SEPARATOR it replaces the current path. An
// empty |name| is a no-op. If the buffer cannot be grown the path is left
// unchanged. Returns *this.
String8& String8::appendPath(const char* name)
{
    // TODO: The test below will fail for Win32 paths. Fix later or ignore.
    if (name[0] == OS_PATH_SEPARATOR) {
        setPathName(name);
        return *this;
    }

    if (*name == '\0') {
        // nothing to do
        return *this;
    }

    size_t len = length();
    if (len == 0) {
        // no existing filename, just use the new one
        setPathName(name);
        return *this;
    }

    // make room for oldPath + '/' + newPath
    const size_t newlen = strlen(name);

    char* buf = lockBuffer(len+1+newlen);
    if (buf == NULL) {
        // lockBuffer() did not reassign mString; keep the old path.
        return *this;
    }

    // insert a '/' if needed
    if (buf[len-1] != OS_PATH_SEPARATOR) {
        buf[len++] = OS_PATH_SEPARATOR;
    }

    // newlen+1 also copies |name|'s terminator.
    memcpy(buf+len, name, newlen+1);
    len += newlen;

    unlockBuffer(len);

    return *this;
}
|
|
|
|
// Rewrites every OS_PATH_SEPARATOR in the string as RES_PATH_SEPARATOR.
// Compiles to a no-op when the two separators are the same. Returns *this.
String8& String8::convertToResPath()
{
#if OS_PATH_SEPARATOR != RES_PATH_SEPARATOR
    const size_t len = length();
    if (len > 0) {
        char* const chars = lockBuffer(len);
        for (size_t i = 0; i < len; ++i) {
            if (chars[i] == OS_PATH_SEPARATOR) {
                chars[i] = RES_PATH_SEPARATOR;
            }
        }
        unlockBuffer(len);
    }
#endif
    return *this;
}
|
|
|
|
}; // namespace android
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// Returns the number of char32_t elements before the first zero element
// of |s|.
size_t strlen32(const char32_t *s)
{
    size_t count = 0;
    while (s[count] != 0) {
        ++count;
    }
    return count;
}
|
|
|
|
// Like strlen32(), but examines at most |maxlen| elements of |s|.
size_t strnlen32(const char32_t *s, size_t maxlen)
{
    size_t count = 0;
    // The bound is tested first so that s[count] is never read once the
    // limit has been reached.
    while (count < maxlen && s[count] != 0) {
        ++count;
    }
    return count;
}
|
|
|
|
size_t utf8_length(const char *src)
|
|
{
|
|
const char *cur = src;
|
|
size_t ret = 0;
|
|
while (*cur != '\0') {
|
|
const char first_char = *cur++;
|
|
if ((first_char & 0x80) == 0) { // ASCII
|
|
ret += 1;
|
|
continue;
|
|
}
|
|
// (UTF-8's character must not be like 10xxxxxx,
|
|
// but 110xxxxx, 1110xxxx, ... or 1111110x)
|
|
if ((first_char & 0x40) == 0) {
|
|
return 0;
|
|
}
|
|
|
|
int32_t mask, to_ignore_mask;
|
|
size_t num_to_read = 0;
|
|
char32_t utf32 = 0;
|
|
for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0x80;
|
|
num_to_read < 5 && (first_char & mask);
|
|
num_to_read++, to_ignore_mask |= mask, mask >>= 1) {
|
|
if ((*cur & 0xC0) != 0x80) { // must be 10xxxxxx
|
|
return 0;
|
|
}
|
|
// 0x3F == 00111111
|
|
utf32 = (utf32 << 6) + (*cur++ & 0x3F);
|
|
}
|
|
// "first_char" must be (110xxxxx - 11110xxx)
|
|
if (num_to_read == 5) {
|
|
return 0;
|
|
}
|
|
to_ignore_mask |= mask;
|
|
utf32 |= ((~to_ignore_mask) & first_char) << (6 * (num_to_read - 1));
|
|
if (utf32 > android::kUnicodeMaxCodepoint) {
|
|
return 0;
|
|
}
|
|
|
|
ret += num_to_read;
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
// Counts the UTF-8 sequences in the first |src_len| bytes of |src|, which is
// the number of UTF-32 characters they decode to.
// The length of each sequence is taken from its lead byte alone; the
// continuation bytes are not validated. Returns 0 for NULL or empty input.
size_t utf32_length(const char *src, size_t src_len)
{
    if (src == NULL || src_len == 0) {
        return 0;
    }

    size_t count = 0;
    const char* const end = src + src_len;
    const char* cur = src;
    while (cur < end) {
        const char lead = *cur;
        size_t seq_len = 1;
        if ((lead & 0x80) != 0) {
            // One extra byte for every leading 1 bit after the first.
            for (int32_t bit = 0x40; (lead & bit) != 0; bit >>= 1) {
                seq_len++;
            }
        }
        cur += seq_len;
        count++;
    }
    return count;
}
|
|
|
|
size_t utf8_length_from_utf32(const char32_t *src, size_t src_len)
|
|
{
|
|
if (src == NULL || src_len == 0) {
|
|
return 0;
|
|
}
|
|
size_t ret = 0;
|
|
const char32_t *end = src + src_len;
|
|
while (src < end) {
|
|
ret += android::utf32_to_utf8_bytes(*src++);
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
size_t utf8_length_from_utf16(const char16_t *src, size_t src_len)
|
|
{
|
|
if (src == NULL || src_len == 0) {
|
|
return 0;
|
|
}
|
|
size_t ret = 0;
|
|
const char16_t* const end = src + src_len;
|
|
while (src < end) {
|
|
if ((*src & 0xFC00) == 0xD800 && (src + 1) < end
|
|
&& (*++src & 0xFC00) == 0xDC00) {
|
|
// surrogate pairs are always 4 bytes.
|
|
ret += 4;
|
|
src++;
|
|
} else {
|
|
ret += android::utf32_to_utf8_bytes((char32_t) *src++);
|
|
}
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
// Decodes the UTF-8 sequence starting at |cur| and returns its value.
// |*num_read| receives the number of bytes consumed.
// The sequence length is taken from the lead byte; continuation bytes are
// neither validated nor bounds-checked, so the caller must make sure the
// bytes are readable.
static int32_t utf32_at_internal(const char* cur, size_t *num_read)
{
    const char lead = *cur;
    if ((lead & 0x80) == 0) { // ASCII
        *num_read = 1;
        return *cur;
    }

    const char* next = cur + 1;
    size_t consumed = 1;
    // Bits to strip from the lead byte once decoding is done. Starting with
    // every bit above the low seven set also discards any sign extension
    // picked up when |lead| was widened.
    char32_t strip_bits = 0xFFFFFF80;
    char32_t marker = 0x40;
    char32_t code = lead;

    // Each marker bit set in the lead byte announces one continuation byte.
    while (lead & marker) {
        // 0x3F == 00111111
        code = (code << 6) + (*next++ & 0x3F);
        consumed++;
        strip_bits |= marker;
        marker >>= 1;
    }

    strip_bits |= marker;
    code &= ~(strip_bits << (6 * (consumed - 1)));

    *num_read = consumed;
    return static_cast<int32_t>(code);
}
|
|
|
|
int32_t utf32_at(const char *src, size_t src_len,
|
|
size_t index, size_t *next_index)
|
|
{
|
|
if (index >= src_len) {
|
|
return -1;
|
|
}
|
|
size_t dummy_index;
|
|
if (next_index == NULL) {
|
|
next_index = &dummy_index;
|
|
}
|
|
size_t num_read;
|
|
int32_t ret = utf32_at_internal(src + index, &num_read);
|
|
if (ret >= 0) {
|
|
*next_index = index + num_read;
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
size_t utf8_to_utf32(const char* src, size_t src_len,
|
|
char32_t* dst, size_t dst_len)
|
|
{
|
|
if (src == NULL || src_len == 0 || dst == NULL || dst_len == 0) {
|
|
return 0;
|
|
}
|
|
|
|
const char* cur = src;
|
|
const char* end = src + src_len;
|
|
char32_t* cur_utf32 = dst;
|
|
const char32_t* end_utf32 = dst + dst_len;
|
|
while (cur_utf32 < end_utf32 && cur < end) {
|
|
size_t num_read;
|
|
*cur_utf32++ =
|
|
static_cast<char32_t>(utf32_at_internal(cur, &num_read));
|
|
cur += num_read;
|
|
}
|
|
if (cur_utf32 < end_utf32) {
|
|
*cur_utf32 = 0;
|
|
}
|
|
return static_cast<size_t>(cur_utf32 - dst);
|
|
}
|
|
|
|
size_t utf32_to_utf8(const char32_t* src, size_t src_len,
|
|
char* dst, size_t dst_len)
|
|
{
|
|
if (src == NULL || src_len == 0 || dst == NULL || dst_len == 0) {
|
|
return 0;
|
|
}
|
|
const char32_t *cur_utf32 = src;
|
|
const char32_t *end_utf32 = src + src_len;
|
|
char *cur = dst;
|
|
const char *end = dst + dst_len;
|
|
while (cur_utf32 < end_utf32 && cur < end) {
|
|
size_t len = android::utf32_to_utf8_bytes(*cur_utf32);
|
|
android::utf32_to_utf8((uint8_t *)cur, *cur_utf32++, len);
|
|
cur += len;
|
|
}
|
|
if (cur < end) {
|
|
*cur = '\0';
|
|
}
|
|
return cur - dst;
|
|
}
|
|
|
|
size_t utf16_to_utf8(const char16_t* src, size_t src_len,
|
|
char* dst, size_t dst_len)
|
|
{
|
|
if (src == NULL || src_len == 0 || dst == NULL || dst_len == 0) {
|
|
return 0;
|
|
}
|
|
const char16_t* cur_utf16 = src;
|
|
const char16_t* const end_utf16 = src + src_len;
|
|
char *cur = dst;
|
|
const char* const end = dst + dst_len;
|
|
while (cur_utf16 < end_utf16 && cur < end) {
|
|
char32_t utf32;
|
|
// surrogate pairs
|
|
if ((*cur_utf16 & 0xFC00) == 0xD800 && (cur_utf16 + 1) < end_utf16) {
|
|
utf32 = (*cur_utf16++ - 0xD800) << 10;
|
|
utf32 |= *cur_utf16++ - 0xDC00;
|
|
utf32 += 0x10000;
|
|
} else {
|
|
utf32 = (char32_t) *cur_utf16++;
|
|
}
|
|
size_t len = android::utf32_to_utf8_bytes(utf32);
|
|
android::utf32_to_utf8((uint8_t*)cur, utf32, len);
|
|
cur += len;
|
|
}
|
|
if (cur < end) {
|
|
*cur = '\0';
|
|
}
|
|
return cur - dst;
|
|
}
|