118 lines
3.7 KiB
C++
118 lines
3.7 KiB
C++
// Copyright Digital Mars 2003 - 2009.
|
|
// Authors: Walter Bright, Sean Kelly
|
|
|
|
// Boost Software License - Version 1.0 - August 17th, 2003
|
|
|
|
// Permission is hereby granted, free of charge, to any person or organization
|
|
// obtaining a copy of the software and accompanying documentation covered by
|
|
// this license (the "Software") to use, reproduce, display, distribute,
|
|
// execute, and transmit the Software, and to prepare derivative works of the
|
|
// Software, and to permit third-parties to whom the Software is furnished to
|
|
// do so, all subject to the following:
|
|
|
|
// The copyright notices in the Software and this entire statement, including
|
|
// the above license grant, this restriction and the following disclaimer,
|
|
// must be included in all copies of the Software, in whole or in part, and
|
|
// all derivative works of the Software, unless such copies or derivative
|
|
// works are solely in the form of machine-executable object code generated by
|
|
// a source language processor.
|
|
|
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
// FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
|
|
// SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
|
|
// FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
|
|
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
// DEALINGS IN THE SOFTWARE.
|
|
|
|
|
|
#include <stdexcept>
|
|
#include <cassert>
|
|
#include "Unicode.h"
|
|
|
|
/***************
|
|
* Decodes and returns character starting at s[idx]. idx is advanced past the
|
|
* decoded character. If the character is not well formed, an exception is
|
|
* thrown and idx remains unchanged.
|
|
*/
|
|
|
|
char32_t decodeUtf8(const char* s, size_t len, size_t& idx)
|
|
{
|
|
assert(idx < len);
|
|
|
|
char32_t V;
|
|
size_t i = idx;
|
|
char u = s[i];
|
|
|
|
if (u & 0x80)
|
|
{
|
|
size_t n;
|
|
char u2;
|
|
|
|
/* The following encodings are valid, except for the 5 and 6 byte
|
|
* combinations:
|
|
* 0xxxxxxx
|
|
* 110xxxxx 10xxxxxx
|
|
* 1110xxxx 10xxxxxx 10xxxxxx
|
|
* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
|
* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
|
* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
|
*/
|
|
for (n = 1; ; n++)
|
|
{
|
|
if (n > 4)
|
|
goto Lerr; // only do the first 4 of 6 encodings
|
|
if (((u << n) & 0x80) == 0)
|
|
{
|
|
if (n == 1)
|
|
goto Lerr;
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Pick off (7 - n) significant bits of B from first byte of octet
|
|
V = static_cast<char32_t>(u & ((1 << (7 - n)) - 1));
|
|
|
|
if (i + (n - 1) >= len)
|
|
goto Lerr; // off end of string
|
|
|
|
/* The following combinations are overlong, and illegal:
|
|
* 1100000x (10xxxxxx)
|
|
* 11100000 100xxxxx (10xxxxxx)
|
|
* 11110000 1000xxxx (10xxxxxx 10xxxxxx)
|
|
* 11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
|
|
* 11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
|
|
*/
|
|
u2 = s[i + 1];
|
|
if ((u & 0xFE) == 0xC0 ||
|
|
(u == 0xE0 && (u2 & 0xE0) == 0x80) ||
|
|
(u == 0xF0 && (u2 & 0xF0) == 0x80) ||
|
|
(u == 0xF8 && (u2 & 0xF8) == 0x80) ||
|
|
(u == 0xFC && (u2 & 0xFC) == 0x80))
|
|
goto Lerr; // overlong combination
|
|
|
|
for (size_t j = 1; j != n; j++)
|
|
{
|
|
u = s[i + j];
|
|
if ((u & 0xC0) != 0x80)
|
|
goto Lerr; // trailing bytes are 10xxxxxx
|
|
V = (V << 6) | (u & 0x3F);
|
|
}
|
|
if (!isValidChar32(V))
|
|
goto Lerr;
|
|
i += n;
|
|
}
|
|
else
|
|
{
|
|
V = static_cast<char32_t>(u);
|
|
i++;
|
|
}
|
|
|
|
idx = i;
|
|
assert(isValidChar32(V));
|
|
return V;
|
|
|
|
Lerr:
|
|
throw std::runtime_error("invalid UTF-8 sequence");
|
|
return V; // dummy return
|
|
}
|