ref: cfa5956105a71501fc6aa1a583c118ba423975aa
parent: 116b703442e52a5dca635b5050d3d7165d2c9662
author: Timothy B. Terriberry <[email protected]>
date: Mon Aug 12 05:48:32 EDT 2013
Minor UTF-8/UTF-16 cleanups: reject the 'not a character' values 0xFFFE and 0xFFFF, and remove some unnecessary string length checks.
--- a/examples/win32utf8.c
+++ b/examples/win32utf8.c
@@ -32,8 +32,9 @@
dst[di++]=(char)(0x80|c0&0x3F);
continue;
}
- else if(c0>=0xD800&&c0<0xDC00&&si+1<len){
+ else if(c0>=0xD800&&c0<0xDC00){
unsigned c1;
+ /*This is safe, because c0 was not 0 and _src is NUL-terminated.*/
c1=_src[si+1];
if(c1>=0xDC00&&c1<0xE000){
unsigned w;
@@ -48,9 +49,9 @@
continue;
}
}
- /*Anything else is either a valid 3-byte sequence, or an invalid
- surrogate pair.
- In the latter case, we just encode the value as a 3-byte
+ /*Anything else is either a valid 3-byte sequence, an invalid surrogate
+ pair, or 'not a character'.
+ In the latter two cases, we just encode the value as a 3-byte
sequence anyway (producing technically invalid UTF-8).
Later error handling will detect the problem, with a better
chance of giving a useful error message.*/
--- a/src/stream.c
+++ b/src/stream.c
@@ -153,8 +153,9 @@
dst[di++]=(wchar_t)c0;
continue;
}
- else if(si+1<len){
+ else{
int c1;
+ /*This is safe, because c0 was not 0 and _src is NUL-terminated.*/
c1=(unsigned char)_src[si+1];
if((c1&0xC0)==0x80){
/*Found at least one continuation byte.*/
@@ -169,8 +170,9 @@
continue;
}
}
- else if(si+2<len){
+ else{
int c2;
+ /*This is safe, because c1 was not 0 and _src is NUL-terminated.*/
c2=(unsigned char)_src[si+2];
if((c2&0xC0)==0x80){
/*Found at least two continuation bytes.*/
@@ -178,16 +180,19 @@
wchar_t w;
/*Start byte says this is a 3-byte sequence.*/
w=(c0&0xF)<<12|(c1&0x3F)<<6|c2&0x3F;
- if(w>=0x800U&&(w<0xD800||w>=0xE000)){
- /*This is a 3-byte sequence that is not overlong and not a
- UTF-16 surrogate pair value.*/
+ if(w>=0x800U&&(w<0xD800||w>=0xE000)&&w<0xFFFE){
+ /*This is a 3-byte sequence that is not overlong, not a
+ UTF-16 surrogate pair value, and not a 'not a character'
+ value.*/
dst[di++]=w;
si+=2;
continue;
}
}
- else if(si+3<len){
+ else{
int c3;
+ /*This is safe, because c2 was not 0 and _src is
+ NUL-terminated.*/
c3=(unsigned char)_src[si+3];
if((c3&0xC0)==0x80){
/*Found at least three continuation bytes.*/