/*
  Basic UTF-8 manipulation routines
  by Jeff Bezanson
  placed in the public domain Fall 2005

  This code is designed to provide the utilities you need to manipulate
  UTF-8 as an internal string encoding. These functions do not perform the
  error checking normally needed when handling UTF-8 data, so if you happen
  to be from the Unicode Consortium you will want to flay me alive.
  I do this because error checking can be performed at the boundaries (I/O),
  with these routines reserved for higher performance on data known to be
  valid.

  modified by Bryan Jurish (moo) March 2009
  + removed some unneeded functions (escapes, printf etc), added others
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <stdarg.h>
/* Platform-specific header selection (per the notes below, the header that
   provides alloca() differs between systems). */
#ifdef _WIN32
# include <malloc.h> /* MSVC or mingw on windows */
#elif defined(__linux__) || defined(__APPLE__)
# include <alloca.h> /* linux, mac, mingw, cygwin */
#else
# include <stdlib.h> /* BSDs for example */
#endif

#include "s_utf8.h"

/* Correction constants for the decoders in this file, indexed by the number
   of trailing bytes in a sequence.  The decoders add up the raw bytes of a
   sequence (shifting the running sum left by 6 before each new byte) and
   then subtract the matching entry, which removes the contribution of the
   lead-byte and continuation-byte marker bits in one step.
   E.g. for two bytes: (0xC0 << 6) + 0x80 == 0x3080. */
static const uint32_t offsetsFromUTF8[6] = {
    0x00000000UL, 0x00003080UL, 0x000E2080UL,
    0x03C82080UL, 0xFA082080UL, 0x82082080UL
};

/* Number of bytes that follow a given first byte of a sequence, indexed by
   the value of that first byte (sequence length = entry + 1). */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 - 0x7F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80 - 0xBF */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 - 0xDF */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0 - 0xEF */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0 - 0xFF */
    3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};


/* Length in bytes of the sequence that starts at s[0]: one for the first
   byte plus the trailing-byte count the lookup table gives for it. */
int u8_seqlen(char *s)
{
    const unsigned char lead = (unsigned char)s[0];

    return 1 + trailingBytesForUTF8[(unsigned int)lead];
}

/* conversions without error checking
   only works for valid UTF-8, i.e. no 5- or 6-byte sequences
   srcsz = source size in bytes, or -1 if 0-terminated
   sz = dest size in # of wide characters

   returns # characters converted
   dest will always be L'\0'-terminated, even if there isn't enough room
   for all the characters.
   if sz = srcsz+1 (i.e. 4*srcsz+4 bytes), there will always be enough space.
*/
64
int u8_utf8toucs2(uint16_t *dest, int sz, char *src, int srcsz)
65
{
66
    uint16_t ch;
67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82
    char *src_end = src + srcsz;
    int nb;
    int i=0;

    while (i < sz-1) {
        nb = trailingBytesForUTF8[(unsigned char)*src];
        if (srcsz == -1) {
            if (*src == 0)
                goto done_toucs;
        }
        else {
            if (src + nb >= src_end)
                goto done_toucs;
        }
        ch = 0;
        switch (nb) {
83 84 85
            /* these fall through deliberately, but commenting each explicitly
               seems to quiet the compiler. If that's not future proof we
               can just use copy/pasta and add the break statements */
86
        case 3: ch += (unsigned char)*src++; ch <<= 6;
87
                /* fall through */
88
        case 2: ch += (unsigned char)*src++; ch <<= 6;
89
                /* fall through */
90
        case 1: ch += (unsigned char)*src++; ch <<= 6;
91
                /* fall through */
92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
        case 0: ch += (unsigned char)*src++;
        }
        ch -= offsetsFromUTF8[nb];
        dest[i++] = ch;
    }
 done_toucs:
    dest[i] = 0;
    return i;
}

/* Convert UCS-2 (one uint16_t per character) to UTF-8.

   srcsz = number of source characters, or -1 if 0-terminated
   sz = size of dest buffer in bytes

   returns # characters converted
   dest will only be '\0'-terminated if there is enough space. this is
   for consistency; imagine there are 2 bytes of space left, but the next
   character requires 3 bytes. in this case we could NUL-terminate, but in
   general we can't when there's insufficient space. therefore this function
   only NUL-terminates if all the characters fit, and there's space for
   the NUL as well.
   each source character needs 1 to 3 bytes, so 3*(# of characters)+1 bytes
   are always enough.  surrogates are not combined; each 16-bit unit is
   encoded on its own.
*/
int u8_ucs2toutf8(char *dest, int sz, uint16_t *src, int srcsz)
{
    int i = 0;      /* index of the next source character */
    int used = 0;   /* bytes written to dest so far */

    while (srcsz<0 ? src[i]!=0 : i < srcsz) {
        const uint16_t ch = src[i];
        const int need = (ch < 0x80) ? 1 : (ch < 0x800) ? 2 : 3;

        /* compare byte counts rather than pointers, so that no pointer
           outside of the dest buffer is ever formed */
        if (need > sz - used)
            return i;

        switch (need) {
        case 1:
            dest[used++] = (char)ch;
            break;
        case 2:
            dest[used++] = (char)((ch>>6) | 0xC0);
            dest[used++] = (char)((ch & 0x3F) | 0x80);
            break;
        default:
            dest[used++] = (char)((ch>>12) | 0xE0);
            dest[used++] = (char)(((ch>>6) & 0x3F) | 0x80);
            dest[used++] = (char)((ch & 0x3F) | 0x80);
            break;
        }
        i++;
    }
    if (used < sz)
        dest[used] = '\0';
    return i;
}

/* moo: get byte length of character number, or 0 if not supported.
   The supported range ends at 0x10FFFF, which is also the last value that
   u8_wc_toutf8() will encode, so both functions agree on what is valid. */
int u8_wc_nbytes(uint32_t ch)
{
  if (ch < 0x80) return 1;
  if (ch < 0x800) return 2;
  if (ch < 0x10000) return 3;
  if (ch < 0x110000) return 4;
  return 0; /*-- bad input --*/
}

/* Encode the single character number ch as UTF-8 into dest.

   dest must have room for up to 4 bytes; no terminator is written.
   returns the number of bytes written (1..4), or 0 (dest untouched) if
   ch is 0x110000 or above.
*/
int u8_wc_toutf8(char *dest, uint32_t ch)
{
    if (ch < 0x80) {
        dest[0] = (char)ch;
        return 1;
    }
    if (ch < 0x800) {
        dest[0] = (char)((ch>>6) | 0xC0);
        dest[1] = (char)((ch & 0x3F) | 0x80);
        return 2;
    }
    if (ch < 0x10000) {
        dest[0] = (char)((ch>>12) | 0xE0);
        dest[1] = (char)(((ch>>6) & 0x3F) | 0x80);
        dest[2] = (char)((ch & 0x3F) | 0x80);
        return 3;
    }
    if (ch < 0x110000) {
        dest[0] = (char)((ch>>18) | 0xF0);
        dest[1] = (char)(((ch>>12) & 0x3F) | 0x80);
        dest[2] = (char)(((ch>>6) & 0x3F) | 0x80);
        dest[3] = (char)((ch & 0x3F) | 0x80);
        return 4;
    }
    return 0;
}

/*-- moo --*/
/* Like u8_wc_toutf8(), but also writes a '\0' right after the encoded
   bytes.  dest therefore needs room for up to 5 bytes.
   returns the number of bytes written, not counting the terminator; if
   u8_wc_toutf8() rejects ch (returns 0), dest becomes the empty string. */
int u8_wc_toutf8_nul(char *dest, uint32_t ch)
{
  int sz = u8_wc_toutf8(dest,ch);
  dest[sz] = '\0';
  return sz;
}

/* charnum => byte offset

   str     = buffer to scan; it does not have to be 0-terminated
   charnum = index of the wanted character (values <= 0 select the first)
   bufsize = number of valid bytes in str

   returns the offset of the charnum'th byte for which isutf() is true, or
   bufsize if the buffer holds fewer such bytes.  returns 0 if bufsize <= 0.
   if str does not begin with a byte for which isutf() is true, this is
   reported through bug() and 0 is returned.

   This deliberately walks the buffer one byte at a time and never reads at
   or beyond str[bufsize]: earlier versions skipped over whole sequences and
   could run off the end of buffers that aren't 0-terminated, or that end in
   the middle of a multi-byte character.
*/
int u8_offset(char *str, int charnum, int bufsize)
{
    int offs;

    /* an empty buffer has no byte we could legally look at */
    if (bufsize <= 0)
        return 0;

    if (!isutf(str[0]))
    {
        bug("u8_offset");
        return 0;
    }

    for (offs = 0; offs < bufsize; offs++)
    {
        if (isutf(str[offs]))
        {
            if (charnum <= 0)
                break;
            charnum -= 1;
        }
    }
    return offs;
}

/* byte offset => charnum

   returns how many of the first `offset` bytes of s satisfy isutf().
   Only s[0] .. s[offset-1] are read, so s does not have to be 0-terminated;
   a 0 byte inside that range does not stop the count.

   This deliberately walks the buffer one byte at a time: earlier versions
   skipped over whole sequences and could read beyond `offset` when the
   buffer wasn't 0-terminated.
*/
int u8_charnum(char *s, int offset)
{
    int charnum = 0;
    int i;

    for (i = 0; i < offset; i++)
    {
        if (isutf(s[i]))
            charnum += 1;
    }
    return charnum;
}

/* reads the next utf-8 sequence out of a string, updating an index */
307
uint32_t u8_nextchar(char *s, int *i)
308
{
309
    uint32_t ch = 0;
310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354
    int sz = 0;

    do {
        ch <<= 6;
        ch += (unsigned char)s[(*i)++];
        sz++;
    } while (s[*i] && !isutf(s[*i]));
    ch -= offsetsFromUTF8[sz-1];

    return ch;
}

/* number of characters in the 0-terminated string s

   Counts the sequences u8_nextchar() finds, stopping at the terminator (or
   at a sequence that decodes to 0).  The terminator is tested here, before
   each call, so that u8_nextchar() is never asked to decode it and nothing
   behind the end of the string is read.
*/
int u8_strlen(char *s)
{
    int count = 0;
    int i = 0;

    while (s[i] != 0 && u8_nextchar(s, &i) != 0)
        count++;

    return count;
}

/* Move the index *i forward by one character: step over the byte at *i and,
   if that byte has its high bit set, over up to three following bytes for
   which isutf() is false. */
void u8_inc(char *s, int *i)
{
    int extra;
    const char first = s[(*i)++];

    if (!(first & 0x80))
        return;     /* plain 7-bit byte, nothing else to skip */

    for (extra = 0; extra < 3; extra++) {
        if (isutf(s[*i]))
            break;
        ++(*i);
    }
}

/* Move the index *i backward by one character: step back one byte at a time
   until a byte satisfying isutf() is reached, giving up after four steps.
   The byte reached by the fourth step is not inspected. */
void u8_dec(char *s, int *i)
{
    int step;

    for (step = 1; step <= 4; step++) {
        --(*i);
        if (step == 4 || isutf(s[*i]))
            break;
    }
}