shithub: femtolisp

--- a/utf8.c

+++ b/utf8.c

@@ -191,20 +191,6 @@

 	return 0;

-/* charnum => byte offset */

-size_t

-u8_offset(const char *s, size_t charnum)

-{

-	size_t i = 0;

-	while(charnum > 0){

-		if(s[i++] & 0x80)

-			(void)(isutf(s[++i]) || isutf(s[++i]) || ++i);

-		charnum--;

-	}

-	return i;

-}

 /* byte offset => charnum */

 size_t

 u8_charnum(const char *s, size_t offset)

@@ -219,27 +205,7 @@

 	return charnum;

-/* number of characters in NUL-terminated string */

 size_t

-u8_strlen(const char *s)

-{

-	size_t count = 0;

-	size_t i = 0, lasti;

-	while(1) {

-		lasti = i;

-		while(s[i] > 0)

-			i++;

-		count += (i-lasti);

-		if(s[i++] == 0)

-			break;

-		(void)(isutf(s[++i]) || isutf(s[++i]) || ++i);

-		count++;

-	}

-	return count;

-}

-size_t

 u8_strwidth(const char *s)

 	uint32_t ch;

@@ -307,18 +273,6 @@

 	return ch - offsetsFromUTF8[sz-1];

-void

-u8_inc(const char *s, size_t *i)

-{

-	(void)(isutf(s[++(*i)]) || isutf(s[++(*i)]) || isutf(s[++(*i)]) || ++(*i));

-}

-void

-u8_dec(const char *s, size_t *i)

-{

-	(void)(isutf(s[--(*i)]) || isutf(s[--(*i)]) || isutf(s[--(*i)]) || --(*i));

-}

int

 octal_digit(char c)

@@ -347,70 +301,6 @@

 	return c;

-/* assumes that src points to the character after a backslash

-   returns number of input characters processed, 0 if error */

-size_t

-u8_read_escape_sequence(const char *str, size_t ssz, uint32_t *dest)

-{

-	assert(ssz > 0);

-	uint32_t ch;

-	char digs[10];

-	int dno = 0, ndig;

-	size_t i = 1;

-	char c0 = str[0];

-	if(octal_digit(c0)){

-		i = 0;

-		do{

-			digs[dno++] = str[i++];

-		}while(i < ssz && octal_digit(str[i]) && dno < 3);

-		digs[dno] = '\0';

-		ch = strtol(digs, nil, 8);

-	}else if((c0 == 'x' && (ndig = 2)) || (c0 == 'u' && (ndig = 4)) || (c0 == 'U' && (ndig = 8))){

-		while(i<ssz && hex_digit(str[i]) && dno < ndig)

-			digs[dno++] = str[i++];

-		if(dno == 0)

-			return 0;

-		digs[dno] = '\0';

-		ch = strtol(digs, nil, 16);

-	}else{

-		ch = (uint32_t)read_escape_control_char(c0);

-	}

-	*dest = ch;

-	return i;

-}

-/* convert a string with literal \uxxxx or \Uxxxxxxxx characters to UTF-8

-   example: u8_unescape(mybuf, 256, "hello\\u220e")

-   note the double backslash is needed if called on a C string literal */

-size_t

-u8_unescape(char *buf, size_t sz, const char *src)

-{

-	size_t c = 0, amt;

-	uint32_t ch;

-	char temp[4];

-	while(*src && c < sz){

-		if(*src == '\\'){

-			src++;

-			amt = u8_read_escape_sequence(src, 1000, &ch);

-		}else{

-			ch = (uint32_t)*src;

-			amt = 1;

-		}

-		src += amt;

-		amt = u8_wc_toutf8(temp, ch);

-		if(amt > sz-c)

-			break;

-		memmove(&buf[c], temp, amt);

-		c += amt;

-	}

-	if(c < sz)

-		buf[c] = '\0';

-	return c;

-}

 static inline int

 buf_put2c(char *buf, const char *src)

@@ -483,25 +373,6 @@

 char *

-u8_strchr(const char *s, uint32_t ch, size_t *charn)

-{

-	size_t i = 0, lasti = 0;

-	uint32_t c;

-	*charn = 0;

-	while(s[i]){

-		c = u8_nextchar(s, &i);

-		if(c == ch){

-			/* it's const for us, but not necessarily the caller */

-			return (char*)&s[lasti];

-		}

-		lasti = i;

-		(*charn)++;

-	}

-	return nil;

-}

-char *

 u8_memchr(const char *s, uint32_t ch, size_t sz, size_t *charn)

 	size_t i = 0, lasti = 0;

@@ -524,77 +395,6 @@

 		(*charn)++;

 	return nil;

-}

-char *

-u8_memrchr(const char *s, uint32_t ch, size_t sz)

-{

-	size_t i = sz-1, tempi = 0;

-	uint32_t c;

-	if(sz == 0)

-		return nil;

-	while(i && !isutf(s[i]))

-		i--;

-	while(1){

-		tempi = i;

-		c = u8_nextmemchar(s, &tempi);

-		if(c == ch)

-			return (char*)&s[i];

-		if(i == 0)

-			break;

-		tempi = i;

-		u8_dec(s, &i);

-		if(i > tempi)

-			break;

-	}

-	return nil;

-}

-size_t

-u8_vprintf(const char *fmt, va_list ap)

-{

-	size_t cnt, sz, nc, needfree = 0;

-	char *buf, tmp[512];

-	uint32_t *wcs;

-	sz = 512;

-	buf = tmp;

-	cnt = vsnprintf(buf, sz, fmt, ap);

-	if((ssize_t)cnt < 0)

-		return 0;

-	if(cnt >= sz){

-		buf = (char*)malloc(cnt + 1);

-		needfree = 1;

-		vsnprintf(buf, cnt+1, fmt, ap);

-	}

-	wcs = (uint32_t*)malloc((cnt+1) * sizeof(uint32_t));

-	nc = u8_toucs(wcs, cnt+1, buf, cnt);

-	wcs[nc] = 0;

-#if defined(__plan9__)

-	print("%S", (Rune*)wcs);

-#else

-	printf("%ls", (wchar_t*)wcs);

-#endif

-	free(wcs);

-	if(needfree)

-		free(buf);

-	return nc;

-}

-size_t

-u8_printf(const char *fmt, ...)

-{

-	size_t cnt;

-	va_list args;

-	va_start(args, fmt);

-	cnt = u8_vprintf(fmt, args);

-	va_end(args);

-	return cnt;

 /* based on the valid_utf8 routine from the PCRE library by Philip Hazel

--- a/utf8.h

+++ b/utf8.h

@@ -15,9 +15,6 @@

 /* single character to UTF-8, returns # bytes written */

 size_t u8_wc_toutf8(char *dest, uint32_t ch);

-/* character number to byte offset */

-size_t u8_offset(const char *str, size_t charnum);

 /* byte offset to character number */

 size_t u8_charnum(const char *s, size_t offset);

@@ -27,12 +24,6 @@

 /* next character without NUL character terminator */

 uint32_t u8_nextmemchar(const char *s, size_t *i);

-/* move to next character */

-void u8_inc(const char *s, size_t *i);

-/* move to previous character */

-void u8_dec(const char *s, size_t *i);

 /* returns length of next utf-8 sequence */

 size_t u8_seqlen(const char *s);

@@ -44,19 +35,11 @@

 char read_escape_control_char(char c);

-/* assuming src points to the character after a backslash, read an

-   escape sequence, storing the result in dest and returning the number of

-   input characters processed */

-size_t u8_read_escape_sequence(const char *src, size_t ssz, uint32_t *dest);

 /* given a wide character, convert it to an ASCII escape sequence stored in

    buf, where buf is "sz" bytes. returns the number of characters output.

    sz must be at least 3. */

 int u8_escape_wchar(char *buf, size_t sz, uint32_t ch);

-/* convert a string "src" containing escape sequences to UTF-8 */

-size_t u8_unescape(char *buf, size_t sz, const char *src);

 /* convert UTF-8 "src" to escape sequences.

    sz is buf size in bytes. must be at least 12.

@@ -79,27 +62,12 @@

 int octal_digit(char c);

 int hex_digit(char c);

-/* return a pointer to the first occurrence of ch in s, or nil if not

-   found. character index of found character returned in *charn. */

-char *u8_strchr(const char *s, uint32_t ch, size_t *charn);

 /* same as the above, but searches a buffer of a given size instead of

    a NUL-terminated string. */

 char *u8_memchr(const char *s, uint32_t ch, size_t sz, size_t *charn);

-char *u8_memrchr(const char *s, uint32_t ch, size_t sz);

-/* count the number of characters in a UTF-8 string */

-size_t u8_strlen(const char *s);

 /* number of columns occupied by a string */

 size_t u8_strwidth(const char *s);

-/* printf where the format string and arguments may be in UTF-8.

-   you can avoid this function and just use ordinary printf() if the current

-   locale is UTF-8. */

-size_t u8_vprintf(const char *fmt, va_list ap);

-size_t u8_printf(const char *fmt, ...);

 /* determine whether a sequence of bytes is valid UTF-8. length is in bytes */

 int u8_isvalid(const char *str, int length);