shithub: choc

ref: 1990930eb1cae6e60296ba6c85a946b063624f48
dir: /textscreen/txt_utf8.c/

View raw version
// Emacs style mode select   -*- C++ -*- 
//-----------------------------------------------------------------------------
//
// Copyright(C) 2012 Simon Howard
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
// 02111-1307, USA.
//

#include <stdlib.h>
#include <string.h>

#include "txt_utf8.h"

// Write the UTF-8 encoding of the code point 'c' into the buffer at 'p'.
//
// Returns a pointer to the byte following the last one written, so
// successive calls can be chained to build up a string.  Between one
// and four bytes are written; no NUL terminator is added.  Values of
// 0x200000 and above do not fit in four bytes: nothing is written for
// them and 'p' is returned unchanged.

char *TXT_EncodeUTF8(char *p, unsigned int c)
{
    unsigned int len;
    unsigned int lead;
    unsigned int i;

    // Work out the sequence length and the marker bits that go in the
    // high end of the leading byte.

    if (c < 0x80)                             // 1 byte (ASCII)
    {
        len = 1;
        lead = 0x00;
    }
    else if (c < 0x800)                       // 2 bytes
    {
        len = 2;
        lead = 0xc0;
    }
    else if (c < 0x10000)                     // 3 bytes
    {
        len = 3;
        lead = 0xe0;
    }
    else if (c < 0x200000)                    // 4 bytes
    {
        len = 4;
        lead = 0xf0;
    }
    else
    {
        // Too big to encode.

        return p;
    }

    // Fill in the continuation bytes from last to first; each carries
    // the next six low-order bits of the code point.

    for (i = len - 1; i > 0; --i)
    {
        p[i] = (char) (0x80 | (c & 0x3f));
        c >>= 6;
    }

    // Whatever bits remain belong in the leading byte.

    p[0] = (char) (lead | c);

    return p + len;
}

// Decode a single UTF-8 character from the string at *ptr.
//
// On success, returns the decoded code point and advances *ptr past
// the bytes that were consumed (one to four).  Note that a NUL byte
// decodes successfully as code point 0 and advances *ptr by one.
//
// On failure, returns 0 and leaves *ptr unchanged.  Decoding fails for
// a stray continuation byte, a sequence whose continuation bytes are
// missing, a 5/6 byte sequence, or an overlong encoding (a code point
// stored in more bytes than it needs).  Overlong forms are malformed
// UTF-8 and would otherwise let characters such as '/' or NUL be
// smuggled past byte-level checks.
//
// Four byte sequences are accepted up to 0x1fffff, mirroring the range
// that TXT_EncodeUTF8 is able to produce.

unsigned int TXT_DecodeUTF8(const char **ptr)
{
    // Read the input as unsigned bytes so that the bit tests below do
    // not depend on whether plain 'char' is signed.
    const unsigned char *p = (const unsigned char *) *ptr;
    unsigned int c;
    unsigned int min;       // smallest code point valid for this length
    unsigned int len;

    if (p[0] < 0x80)                          // 1 byte (ASCII):
    {
        *ptr += 1;
        return p[0];
    }

    // The continuation byte tests rely on && short-circuiting: p[n+1]
    // is only examined once p[n] is known to be a (non-NUL)
    // continuation byte, so a NUL terminated string is never overrun.

    if ((p[0] & 0xe0) == 0xc0                 // 2 bytes:
     && (p[1] & 0xc0) == 0x80)
    {
        c = ((p[0] & 0x1fu) << 6)
          |  (p[1] & 0x3fu);
        min = 0x80;
        len = 2;
    }
    else if ((p[0] & 0xf0) == 0xe0            // 3 bytes:
          && (p[1] & 0xc0) == 0x80
          && (p[2] & 0xc0) == 0x80)
    {
        c = ((p[0] & 0x0fu) << 12)
          | ((p[1] & 0x3fu) << 6)
          |  (p[2] & 0x3fu);
        min = 0x800;
        len = 3;
    }
    else if ((p[0] & 0xf8) == 0xf0            // 4 bytes:
          && (p[1] & 0xc0) == 0x80
          && (p[2] & 0xc0) == 0x80
          && (p[3] & 0xc0) == 0x80)
    {
        c = ((p[0] & 0x07u) << 18)
          | ((p[1] & 0x3fu) << 12)
          | ((p[2] & 0x3fu) << 6)
          |  (p[3] & 0x3fu);
        min = 0x10000;
        len = 4;
    }
    else
    {
        // Decode failure.
        // Don't bother with 5/6 byte sequences.

        return 0;
    }

    if (c < min)
    {
        // Overlong encoding: the value would have fitted in a shorter
        // sequence, so this is not well-formed UTF-8.

        return 0;
    }

    *ptr += len;

    return c;
}

// Return the length of the UTF-8 string 's', measured in characters
// rather than bytes.
//
// Counting stops at the terminating NUL, or early at the first
// sequence that TXT_DecodeUTF8 reports as 0 (a decode failure); only
// the characters before that point are counted.

unsigned int TXT_UTF8_Strlen(const char *s)
{
    const char *cursor = s;
    unsigned int count = 0;

    // The terminator test comes first so that the decoder is never
    // asked to consume the NUL byte itself.

    while (*cursor != '\0' && TXT_DecodeUTF8(&cursor) != 0)
    {
        ++count;
    }

    return count;
}

// Skip past the first n characters in a UTF-8 string.
//
// Returns a pointer into 's' at the start of character number n
// (counting from zero).  If the string holds fewer than n characters,
// the returned pointer refers to the terminating NUL; it never points
// beyond the end of the string.  If a sequence fails to decode, the
// returned pointer refers to the start of that sequence.
//
// The result aliases 's' with the const qualifier cast away, so it
// must only be written through if the caller's string is writable.

char *TXT_UTF8_SkipChars(const char *s, unsigned int n)
{
    unsigned int i;
    const char *p;

    p = s;

    // Check for the terminator before decoding: TXT_DecodeUTF8 treats
    // a NUL byte as a one byte character and steps over it, which
    // would leave p one past the end of the string.

    for (i = 0; i < n && *p != '\0'; ++i)
    {
        if (TXT_DecodeUTF8(&p) == 0)
        {
            break;
        }
    }

    return (char *) p;
}