Added unicode character support (UTF-8 <-> UTF-16 conversion) to JSON library
This commit is contained in:
parent
29f5d200cb
commit
8f96245d40
1 changed files with 102 additions and 5 deletions
107
lib/json.cpp
107
lib/json.cpp
|
@ -162,9 +162,34 @@ static inline char hex2c(char c) {
|
||||||
return '0';
|
return '0';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Encodes a single code point as a UTF-8 byte sequence.
///
/// The length of the result depends on the magnitude of the value:
///   - up to 0x7F:   1 byte  (0xxxxxxx)
///   - up to 0x7FF:  2 bytes (110xxxxx 10xxxxxx)
///   - up to 0xFFFF: 3 bytes (1110xxxx 10xxxxxx 10xxxxxx)
///   - anything else: 4 bytes (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
///
/// No validation is performed: surrogate values (0xD800-0xDFFF) are encoded
/// like any other 3-byte value, and bits above the 21 that fit in a 4-byte
/// sequence are discarded.
///
/// \param c The code point to encode.
/// \return The encoded bytes as a std::string.
static std::string UTF8(uint32_t c){
  std::string r;
  if (c <= 0x7F){
    // Plain ASCII: the value is the byte.
    r.append(1, static_cast<char>(c));
    return r;
  }
  if (c <= 0x7FF){
    // 11 payload bits: 5 in the lead byte, 6 in the continuation byte.
    r.append(1, static_cast<char>(0xC0 | (c >> 6)));
    r.append(1, static_cast<char>(0x80 | (c & 0x3F)));
    return r;
  }
  if (c <= 0xFFFF){
    // 16 payload bits: 4 in the lead byte (marker 0xE0), 6 in each continuation byte.
    r.append(1, static_cast<char>(0xE0 | (c >> 12)));
    r.append(1, static_cast<char>(0x80 | ((c >> 6) & 0x3F)));
    r.append(1, static_cast<char>(0x80 | (c & 0x3F)));
    return r;
  }
  // 21 payload bits: 3 in the lead byte (marker 0xF0), 6 in each continuation byte.
  // The mask keeps the lead byte well-formed even if c exceeds 21 bits.
  r.append(1, static_cast<char>(0xF0 | ((c >> 18) & 0x07)));
  r.append(1, static_cast<char>(0x80 | ((c >> 12) & 0x3F)));
  r.append(1, static_cast<char>(0x80 | ((c >> 6) & 0x3F)));
  r.append(1, static_cast<char>(0x80 | (c & 0x3F)));
  return r;
}
|
||||||
|
|
||||||
static std::string read_string(int separator, std::istream & fromstream) {
|
static std::string read_string(int separator, std::istream & fromstream) {
|
||||||
std::string out;
|
std::string out;
|
||||||
bool escaped = false;
|
bool escaped = false;
|
||||||
|
uint32_t fullChar = 0;
|
||||||
while (fromstream.good()) {
|
while (fromstream.good()) {
|
||||||
char c;
|
char c;
|
||||||
fromstream.get(c);
|
fromstream.get(c);
|
||||||
|
@ -173,6 +198,10 @@ static std::string read_string(int separator, std::istream & fromstream) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (escaped) {
|
if (escaped) {
|
||||||
|
if (fullChar && c != 'u'){
|
||||||
|
out += UTF8(fullChar >> 16);
|
||||||
|
fullChar = 0;
|
||||||
|
}
|
||||||
switch (c) {
|
switch (c) {
|
||||||
case 'b':
|
case 'b':
|
||||||
out += '\b';
|
out += '\b';
|
||||||
|
@ -189,15 +218,32 @@ static std::string read_string(int separator, std::istream & fromstream) {
|
||||||
case 't':
|
case 't':
|
||||||
out += '\t';
|
out += '\t';
|
||||||
break;
|
break;
|
||||||
|
case 'x':
|
||||||
|
char d1, d2;
|
||||||
|
fromstream.get(d1);
|
||||||
|
fromstream.get(d2);
|
||||||
|
out.append(1, (c2hex(d2) + (c2hex(d1) << 4)));
|
||||||
|
break;
|
||||||
case 'u': {
|
case 'u': {
|
||||||
char d1, d2, d3, d4;
|
char d1, d2, d3, d4;
|
||||||
fromstream.get(d1);
|
fromstream.get(d1);
|
||||||
fromstream.get(d2);
|
fromstream.get(d2);
|
||||||
fromstream.get(d3);
|
fromstream.get(d3);
|
||||||
fromstream.get(d4);
|
fromstream.get(d4);
|
||||||
out.append(1, (c2hex(d4) + (c2hex(d3) << 4)));
|
uint32_t tmpChar = (c2hex(d4) + (c2hex(d3) << 4) + (c2hex(d2) << 8) + (c2hex(d1) << 16));
|
||||||
//We ignore the upper two characters.
|
if (fullChar && (tmpChar < 0xDC00 || tmpChar > 0xDFFF)){
|
||||||
// + (c2hex(d2) << 8) + (c2hex(d1) << 16)
|
//not a low surrogate - handle high surrogate separately!
|
||||||
|
out += UTF8(fullChar >> 16);
|
||||||
|
fullChar = 0;
|
||||||
|
}
|
||||||
|
fullChar |= tmpChar;
|
||||||
|
if (fullChar >= 0xD800 && fullChar <= 0xDBFF){
|
||||||
|
//possibly high surrogate! Read next characters before handling...
|
||||||
|
fullChar <<= 16;//save as high surrogate
|
||||||
|
}else{
|
||||||
|
out += UTF8(fullChar);
|
||||||
|
fullChar = 0;
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
|
@ -206,6 +252,10 @@ static std::string read_string(int separator, std::istream & fromstream) {
|
||||||
}
|
}
|
||||||
escaped = false;
|
escaped = false;
|
||||||
} else {
|
} else {
|
||||||
|
if (fullChar){
|
||||||
|
out += UTF8(fullChar >> 16);
|
||||||
|
fullChar = 0;
|
||||||
|
}
|
||||||
if (c == separator) {
|
if (c == separator) {
|
||||||
return out;
|
return out;
|
||||||
} else {
|
} else {
|
||||||
|
@ -213,13 +263,32 @@ static std::string read_string(int separator, std::istream & fromstream) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (fullChar){
|
||||||
|
out += UTF8(fullChar >> 16);
|
||||||
|
fullChar = 0;
|
||||||
|
}
|
||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Renders a code point as JSON-style "\uXXXX" escape text.
///
/// Values up to 0xFFFF become a single escape consisting of a backslash, 'u'
/// and four digits produced by hex2c (most significant nibble first).
/// Larger values have 0x010000 subtracted and are split into two 10-bit
/// halves, emitted as a 0xD800-based escape followed by a 0xDC00-based escape.
///
/// \param c The code point to render.
/// \return The escape text (one or two "\uXXXX" groups).
static std::string UTF16(uint32_t c){
  if (c <= 0xFFFF){
    std::string escape = "\\u";
    // Walk the four nibbles from most to least significant.
    for (int shift = 12; shift >= 0; shift -= 4){
      escape += hex2c((c >> shift) & 0xf);
    }
    return escape;
  }
  // Outside the 16-bit range: build a surrogate pair from the offset.
  const uint32_t offset = c - 0x010000;
  const uint32_t highHalf = 0xD800 + ((offset >> 10) & 0x3FF);
  const uint32_t lowHalf = 0xDC00 + (offset & 0x3FF);
  return UTF16(highHalf) + UTF16(lowHalf);
}
|
||||||
|
|
||||||
static std::string string_escape(const std::string val) {
|
static std::string string_escape(const std::string val) {
|
||||||
std::string out = "\"";
|
std::string out = "\"";
|
||||||
for (unsigned int i = 0; i < val.size(); ++i) {
|
for (unsigned int i = 0; i < val.size(); ++i) {
|
||||||
switch (val.data()[i]) {
|
const char & c = val.data()[i];
|
||||||
|
switch (c) {
|
||||||
case '"':
|
case '"':
|
||||||
out += "\\\"";
|
out += "\\\"";
|
||||||
break;
|
break;
|
||||||
|
@ -242,7 +311,35 @@ static std::string string_escape(const std::string val) {
|
||||||
out += "\\t";
|
out += "\\t";
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
if (val.data()[i] < 32 || val.data()[i] > 126) {
|
if (c < 32 || c > 126) {
|
||||||
|
//we assume our data is UTF-8 encoded internally.
|
||||||
|
//JavaScript expects UTF-16, so if we recognize a valid UTF-8 sequence, we turn it into UTF-16 for JavaScript.
|
||||||
|
//Anything else is escaped as a single character UTF-16 escape.
|
||||||
|
if ((c & 0xC0) == 0xC0){
|
||||||
|
//possible UTF-8 sequence
|
||||||
|
//check for 2-byte sequence
|
||||||
|
if (((c & 0xE0) == 0XC0) && (i+1 < val.size()) && ((val.data()[i+1] & 0xC0) == 0x80)){
|
||||||
|
//valid 2-byte sequence
|
||||||
|
out += UTF16(((c & 0x1F) << 6) | (val.data()[i+1] & 0x3F));
|
||||||
|
i += 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
//check for 3-byte sequence
|
||||||
|
if (((c & 0xF0) == 0XE0) && (i+2 < val.size()) && ((val.data()[i+1] & 0xC0) == 0x80) && ((val.data()[i+2] & 0xC0) == 0x80)){
|
||||||
|
//valid 3-byte sequence
|
||||||
|
out += UTF16(((c & 0x1F) << 12) | ((val.data()[i+1] & 0x3F) << 6) | (val.data()[i+2] & 0x3F));
|
||||||
|
i += 2;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
//check for 4-byte sequence
|
||||||
|
if (((c & 0xF8) == 0XF0) && (i+3 < val.size()) && ((val.data()[i+1] & 0xC0) == 0x80) && ((val.data()[i+2] & 0xC0) == 0x80) && ((val.data()[i+3] & 0xC0) == 0x80)){
|
||||||
|
//valid 4-byte sequence
|
||||||
|
out += UTF16(((c & 0x1F) << 18) | ((val.data()[i+1] & 0x3F) << 12) | ((val.data()[i+2] & 0x3F) << 6) | (val.data()[i+3] & 0x3F));
|
||||||
|
i += 3;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
//Anything else, we encode as a single UTF-16 character.
|
||||||
out += "\\u00";
|
out += "\\u00";
|
||||||
out += hex2c((val.data()[i] >> 4) & 0xf);
|
out += hex2c((val.data()[i] >> 4) & 0xf);
|
||||||
out += hex2c(val.data()[i] & 0xf);
|
out += hex2c(val.data()[i] & 0xf);
|
||||||
|
|
Loading…
Add table
Reference in a new issue