From 23045bf4b28af5b9def688516d243cc9068ce848 Mon Sep 17 00:00:00 2001 From: Ivica Ico Bukvic <ico@vt.edu> Date: Sat, 15 Dec 2012 16:33:19 -0500 Subject: [PATCH] backported utf-8 support (may require further testing) --- pd/src/g_editor.c | 4 + pd/src/g_rtext.c | 224 +++++++++++++++++++++----------- pd/src/makefile.in | 3 +- pd/src/pd.tk | 11 +- pd/src/s_utf8.c | 310 +++++++++++++++++++++++++++++++++++++++++++++ pd/src/s_utf8.h | 88 +++++++++++++ 6 files changed, 564 insertions(+), 76 deletions(-) create mode 100644 pd/src/s_utf8.c create mode 100644 pd/src/s_utf8.h diff --git a/pd/src/g_editor.c b/pd/src/g_editor.c index f4c0457fd..5c23a7424 100644 --- a/pd/src/g_editor.c +++ b/pd/src/g_editor.c @@ -9,6 +9,7 @@ #include "s_stuff.h" #include "g_magicglass.h" #include "g_canvas.h" +#include "s_utf8.h" /*-- moo --*/ #include "g_undo.h" #include "x_preset.h" #include <string.h> @@ -3490,6 +3491,9 @@ void canvas_key(t_canvas *x, t_symbol *s, int ac, t_atom *av) case 127:gotkeysym = gensym("Delete"); break; default: sprintf(buf, "%c", (int)(av[1].a_w.w_float)); + /*-- moo: assume keynum is a Unicode codepoint; encode as UTF-8 --*/ + char buf[UTF8_MAXBYTES1]; + u8_wc_toutf8_nul(buf, (UCS4)(av[1].a_w.w_float)); gotkeysym = gensym(buf); } } diff --git a/pd/src/g_rtext.c b/pd/src/g_rtext.c index 42daac36d..cf8ff2bd3 100644 --- a/pd/src/g_rtext.c +++ b/pd/src/g_rtext.c @@ -14,6 +14,7 @@ #include "m_imp.h" #include "s_stuff.h" #include "g_canvas.h" +#include "s_utf8.h" #include "t_tk.h" #define LMARGIN 2 @@ -39,10 +40,10 @@ static int last_sel = 0; struct _rtext { - char *x_buf; - int x_bufsize; - int x_selstart; - int x_selend; + char *x_buf; /*-- raw byte string, assumed UTF-8 encoded (moo) --*/ + int x_bufsize; /*-- byte length --*/ + int x_selstart; /*-- byte offset --*/ + int x_selend; /*-- byte offset --*/ int x_active; int x_dragfrom; int x_height; @@ -111,8 +112,30 @@ void rtext_getseltext(t_rtext *x, char **buf, int *bufsize) *bufsize = x->x_selend - x->x_selstart; } 
+/* convert t_text te_type symbol for use as a Tk tag */ +static t_symbol *rtext_gettype(t_rtext *x) +{ + switch (x->x_text->te_type) + { + case T_TEXT: return gensym("text"); + case T_OBJECT: return gensym("obj"); + case T_MESSAGE: return gensym("msg"); + case T_ATOM: return gensym("atom"); + } + return (&s_); +} + /* LATER deal with tcl-significant characters */ +/* firstone(), lastone() + * + returns byte offset of (first|last) occurrence of 'c' in 's[0..n-1]', or + * -1 if none was found + * + 's' is a raw byte string + * + 'c' is a byte value + * + 'n' is the length (in bytes) of the prefix of 's' to be searched. + * + we could make these functions work on logical characters in utf8 strings, + * but we don't really need to... + */ static int firstone(char *s, int c, int n) { char *s2 = s + n; @@ -149,6 +172,16 @@ static int lastone(char *s, int c, int n) of the entire text in pixels. */ + /*-- moo: + * + some variables from the original version have been renamed + * + variables with a "_b" suffix are raw byte strings, lengths, or offsets + * + variables with a "_c" suffix are logical character lengths or offsets + * (assuming valid UTF-8 encoded byte string in x->x_buf) + * + a fair amount of O(n) computations required to convert between raw byte + * offsets (needed by the C side) and logical character offsets (needed by + * the GUI) + */ + /* LATER get this and sys_vgui to work together properly, breaking up messages as needed. As of now, there's a limit of 1950 characters, imposed by sys_vgui(). */ @@ -167,14 +200,17 @@ static void rtext_senditup(t_rtext *x, int action, int *widthp, int *heightp, if (x) { t_float dispx, dispy; char smallbuf[200], *tempbuf; - int outchars = 0, nlines = 0, ncolumns = 0, + int outchars_b = 0, nlines = 0, ncolumns = 0, pixwide, pixhigh, font, fontwidth, fontheight, findx, findy; int reportedindex = 0; t_canvas *canvas = glist_getcanvas(x->x_glist); - int widthspec = x->x_text->te_width; - int widthlimit = (widthspec ? 
widthspec : BOXWIDTH); - int inindex = 0; - int selstart = 0, selend = 0; + + int widthspec_c = x->x_text->te_width; + int widthlimit_c = (widthspec_c ? widthspec_c : BOXWIDTH); + int inindex_b = 0; + int inindex_c = 0; + int selstart_b = 0, selend_b = 0; + int x_bufsize_c = u8_charnum(x->x_buf, x->x_bufsize); /* if we're a GOP (the new, "goprect" style) borrow the font size from the inside to preserve the spacing */ if (pd_class(&x->x_text->te_pd) == canvas_class && @@ -189,74 +225,85 @@ static void rtext_senditup(t_rtext *x, int action, int *widthp, int *heightp, if (x->x_bufsize >= 100) tempbuf = (char *)t_getbytes(2 * x->x_bufsize + 1); else tempbuf = smallbuf; - while (x->x_bufsize - inindex > 0) + while (x_bufsize_c - inindex_c > 0) { - int inchars = x->x_bufsize - inindex; - int maxindex = (inchars > widthlimit ? widthlimit : inchars); + int inchars_b = x->x_bufsize - inindex_b; + int inchars_c = x_bufsize_c - inindex_c; + int maxindex_c = (inchars_c > widthlimit_c ? widthlimit_c : inchars_c); + int maxindex_b = u8_offset(x->x_buf + inindex_b, maxindex_c); int eatchar = 1; - int foundit = firstone(x->x_buf + inindex, '\n', maxindex); - if (foundit < 0) + int foundit_b = firstone(x->x_buf + inindex_b, '\n', maxindex_b); + int foundit_c; + if (foundit_b < 0) { - if (inchars > widthlimit) + if (inchars_c > widthlimit_c) { - foundit = lastone(x->x_buf + inindex, ' ', maxindex); - if (foundit < 0) + foundit_b = lastone(x->x_buf + inindex_b, ' ', maxindex_b); + if (foundit_b < 0) { - foundit = maxindex; + foundit_b = maxindex_b; + foundit_c = maxindex_c; eatchar = 0; } + else + foundit_c = u8_charnum(x->x_buf + inindex_b, foundit_b); } else { - foundit = inchars; + foundit_b = inchars_b; + foundit_c = inchars_c; eatchar = 0; } } + else + foundit_c = u8_charnum(x->x_buf + inindex_b, foundit_b); + if (nlines == findy) { int actualx = (findx < 0 ? 0 : - (findx > foundit ? foundit : findx)); - *indexp = inindex + actualx; + (findx > foundit_c ? 
foundit_c : findx)); + *indexp = inindex_b + u8_offset(x->x_buf + inindex_b, actualx); reportedindex = 1; } - strncpy(tempbuf+outchars, x->x_buf + inindex, foundit); - if (x->x_selstart >= inindex && - x->x_selstart <= inindex + foundit + eatchar) - selstart = x->x_selstart + outchars - inindex; - if (x->x_selend >= inindex && - x->x_selend <= inindex + foundit + eatchar) - selend = x->x_selend + outchars - inindex; - outchars += foundit; - inindex += (foundit + eatchar); - if (inindex < x->x_bufsize) - tempbuf[outchars++] = '\n'; - if (foundit > ncolumns) - ncolumns = foundit; + strncpy(tempbuf+outchars_b, x->x_buf + inindex_b, foundit_b); + if (x->x_selstart >= inindex_b && + x->x_selstart <= inindex_b + foundit_b + eatchar) + selstart_b = x->x_selstart + outchars_b - inindex_b; + if (x->x_selend >= inindex_b && + x->x_selend <= inindex_b + foundit_b + eatchar) + selend_b = x->x_selend + outchars_b - inindex_b; + outchars_b += foundit_b; + inindex_b += (foundit_b + eatchar); + inindex_c += (foundit_c + eatchar); + if (inindex_b < x->x_bufsize) + tempbuf[outchars_b++] = '\n'; + if (foundit_c > ncolumns) + ncolumns = foundit_c; nlines++; } if (!reportedindex) - *indexp = outchars; + *indexp = outchars_b; dispx = text_xpix(x->x_text, x->x_glist); dispy = text_ypix(x->x_text, x->x_glist); if (nlines < 1) nlines = 1; - if (!widthspec) + if (!widthspec_c) { while (ncolumns < 3) { - tempbuf[outchars++] = ' '; + tempbuf[outchars_b++] = ' '; ncolumns++; } } - else ncolumns = widthspec; + else ncolumns = widthspec_c; pixwide = ncolumns * fontwidth + (LMARGIN + RMARGIN); pixhigh = nlines * fontheight + (TMARGIN + BMARGIN); if (action == SEND_FIRST) { //fprintf(stderr,"canvas=.x%lx %s\n", (t_int)canvas, tempbuf); - sys_vgui("pdtk_text_new .x%lx.c %s %f %f {%.*s} %d %s\n", - canvas, x->x_tag, + sys_vgui("pdtk_text_new .x%lx.c {%s %s text} %f %f {%.*s} %d %s\n", + canvas, x->x_tag, rtext_gettype(x)->s_name, dispx + LMARGIN, dispy + TMARGIN, - outchars, tempbuf, 
sys_hostfontsize(font), + outchars_b, tempbuf, sys_hostfontsize(font), (glist_isselected(x->x_glist, &x->x_glist->gl_gobj)? "$select_color" : "$text_color")); } @@ -267,7 +314,7 @@ static void rtext_senditup(t_rtext *x, int action, int *widthp, int *heightp, ((t_glist *)(x->x_text))->gl_isgraph, ((t_glist *)(x->x_text))->gl_goprect );*/ sys_vgui("pdtk_text_set .x%lx.c %s {%.*s}\n", - canvas, x->x_tag, outchars, tempbuf); + canvas, x->x_tag, outchars_b, tempbuf); /*if ( pd_class(&x->x_text->te_pd) == canvas_class && ((t_glist *)(x->x_text))->gl_isgraph && (((t_glist *)(x->x_text))->gl_goprect) ) { @@ -279,19 +326,20 @@ static void rtext_senditup(t_rtext *x, int action, int *widthp, int *heightp, pixwide, pixhigh, 0); if (x->x_active) { - if (selend > selstart) + if (selend_b > selstart_b) { sys_vgui(".x%lx.c select from %s %d\n", canvas, - x->x_tag, selstart); + x->x_tag, u8_charnum(x->x_buf, selstart_b)); sys_vgui(".x%lx.c select to %s %d\n", canvas, - x->x_tag, selend + (sys_oldtclversion ? 0 : -1)); + x->x_tag, u8_charnum(x->x_buf, selend_b) + + (sys_oldtclversion ? 0 : -1)); sys_vgui(".x%lx.c focus \"\"\n", canvas); } else { sys_vgui(".x%lx.c select clear\n", canvas); sys_vgui(".x%lx.c icursor %s %d\n", canvas, x->x_tag, - selstart); + u8_charnum(x->x_buf, selstart_b)); sys_vgui(".x%lx.c focus %s\n", canvas, x->x_tag); } } @@ -467,7 +515,7 @@ void rtext_key(t_rtext *x, int keynum, t_symbol *keysym) .... 
} */ if (x->x_selstart && (x->x_selstart == x->x_selend)) { - x->x_selstart--; + u8_dec(x->x_buf, &x->x_selstart); if (glist_isvisible(glist_getcanvas(x->x_glist))) sys_vgui("pdtk_canvas_getscroll .x%lx.c\n", (t_int)glist_getcanvas(x->x_glist)); } @@ -476,7 +524,7 @@ void rtext_key(t_rtext *x, int keynum, t_symbol *keysym) else if (n == 127) /* delete */ { if (x->x_selend < x->x_bufsize && (x->x_selstart == x->x_selend)) - x->x_selend++; + u8_inc(x->x_buf, &x->x_selend); if (glist_isvisible(glist_getcanvas(x->x_glist))) sys_vgui("pdtk_canvas_getscroll .x%lx.c\n", (t_int)glist_getcanvas(x->x_glist)); } @@ -491,7 +539,13 @@ void rtext_key(t_rtext *x, int keynum, t_symbol *keysym) /* at Guenter's suggestion, use 'n>31' to test wither a character might be printable in whatever 8-bit character set we find ourselves. */ - if (n == '\n' || (n > 31 && n != 127)) +/*-- moo: + ... but test with "<" rather than "!=" in order to accomodate unicode + codepoints for n (which we get since Tk is sending the "%A" substitution + for bind <Key>), effectively reducing the coverage of this clause to 7 + bits. Case n>127 is covered by the next clause. +*/ + if (n == '\n' || (n > 31 && n < 127)) { newsize = x->x_bufsize+1; x->x_buf = resizebytes(x->x_buf, x->x_bufsize, newsize); @@ -502,6 +556,19 @@ be printable in whatever 8-bit character set we find ourselves. 
*/ x->x_selstart = x->x_selstart + 1; if (glist_isvisible(glist_getcanvas(x->x_glist))) sys_vgui("pdtk_canvas_getscroll .x%lx.c\n", (t_int)glist_getcanvas(x->x_glist)); + } + /*--moo: check for unicode codepoints beyond 7-bit ASCII --*/ + else if (n > 127) + { + int ch_nbytes = u8_wc_nbytes(n); + newsize = x->x_bufsize + ch_nbytes; + x->x_buf = resizebytes(x->x_buf, x->x_bufsize, newsize); + for (i = newsize-1; i >= x->x_selstart+ch_nbytes; i--) /* open a ch_nbytes-wide gap: shift the tail by the whole sequence length, not by one byte */ + x->x_buf[i] = x->x_buf[i-ch_nbytes]; + x->x_bufsize = newsize; + /*-- moo: assume canvas_key() has encoded keysym as UTF-8 */ + strncpy(x->x_buf+x->x_selstart, keysym->s_name, ch_nbytes); + x->x_selstart = x->x_selstart + ch_nbytes; } x->x_selend = x->x_selstart; x->x_glist->gl_editor->e_textdirty = 1; @@ -509,7 +576,10 @@ be printable in whatever 8-bit character set we find ourselves. */ else if (!strcmp(keysym->s_name, "Right")) { if (x->x_selend == x->x_selstart && x->x_selstart < x->x_bufsize) - x->x_selend = x->x_selstart = x->x_selstart + 1; + { + u8_inc(x->x_buf, &x->x_selstart); + x->x_selend = x->x_selstart; + } else x->x_selstart = x->x_selend; last_sel = 0; @@ -517,7 +587,10 @@ be printable in whatever 8-bit character set we find ourselves. */ else if (!strcmp(keysym->s_name, "Left")) { if (x->x_selend == x->x_selstart && x->x_selstart > 0) - x->x_selend = x->x_selstart = x->x_selstart - 1; + { + u8_dec(x->x_buf, &x->x_selstart); + x->x_selend = x->x_selstart; + } else x->x_selend = x->x_selstart; last_sel = 0; @@ -527,11 +600,11 @@ be printable in whatever 8-bit character set we find ourselves. 
*/ if (!last_sel) last_sel = 2; if (last_sel == 1 && x->x_selstart < x->x_selend) { if (x->x_selstart < x->x_bufsize) - x->x_selstart = x->x_selstart + 1; + u8_inc(x->x_buf, &x->x_selstart); } else { last_sel = 2; if (x->x_selend < x->x_bufsize) - x->x_selend = x->x_selend + 1; + u8_inc(x->x_buf, &x->x_selend); } } else if (!strcmp(keysym->s_name, "ShiftLeft")) @@ -542,16 +615,16 @@ be printable in whatever 8-bit character set we find ourselves. */ } else { last_sel = 1; if (x->x_selstart > 0) - x->x_selstart = x->x_selstart - 1; + u8_dec(x->x_buf, &x->x_selstart); } } /* this should be improved... life's too short */ else if (!strcmp(keysym->s_name, "Up") || !strcmp(keysym->s_name, "Home")) { if (x->x_selstart) - x->x_selstart--; + u8_dec(x->x_buf, &x->x_selstart); while (x->x_selstart > 0 && x->x_buf[x->x_selstart] != '\n') - x->x_selstart--; + u8_dec(x->x_buf, &x->x_selstart); x->x_selend = x->x_selstart; last_sel = 0; } @@ -559,9 +632,9 @@ be printable in whatever 8-bit character set we find ourselves. */ { while (x->x_selend < x->x_bufsize && x->x_buf[x->x_selend] != '\n') - x->x_selend++; + u8_inc(x->x_buf, &x->x_selend); if (x->x_selend < x->x_bufsize) - x->x_selend++; + u8_inc(x->x_buf, &x->x_selend); x->x_selstart = x->x_selend; last_sel = 0; } @@ -569,31 +642,31 @@ be printable in whatever 8-bit character set we find ourselves. 
*/ { /* first find first non-space char going back */ while (x->x_selstart > 0 && x->x_buf[x->x_selstart-1] == ' ') - x->x_selstart--; + u8_dec(x->x_buf, &x->x_selstart); /* now go back until you find another space or the beginning of the buffer */ while (x->x_selstart > 0 && x->x_buf[x->x_selstart] != '\n' && x->x_buf[x->x_selstart-1] != ' ') - x->x_selstart--; + u8_dec(x->x_buf, &x->x_selstart); if (x->x_buf[x->x_selstart+1] == ' ') - x->x_selstart++; + u8_inc(x->x_buf, &x->x_selstart); x->x_selend = x->x_selstart; } else if (!strcmp(keysym->s_name, "CtrlRight")) { /* now go forward until you find another space or the end of the buffer */ if (x->x_selend < x->x_bufsize - 1) - x->x_selend++; + u8_inc(x->x_buf, &x->x_selend); while (x->x_selend < x->x_bufsize && x->x_buf[x->x_selend] != '\n' && x->x_buf[x->x_selend] != ' ') - x->x_selend++; + u8_inc(x->x_buf, &x->x_selend); /* now skip all the spaces and land before next word */ while (x->x_selend < x->x_bufsize && x->x_buf[x->x_selend] == ' ') - x->x_selend++; + u8_inc(x->x_buf, &x->x_selend); if (x->x_selend > 0 && x->x_buf[x->x_selend-1] == ' ') - x->x_selend--; + u8_dec(x->x_buf, &x->x_selend); x->x_selstart = x->x_selend; } else if (!strcmp(keysym->s_name, "CtrlShiftLeft")) @@ -609,14 +682,17 @@ be printable in whatever 8-bit character set we find ourselves. */ } /* first find first non-space char going back */ while (*target > 0 && x->x_buf[*target-1] == ' ') - (*target)--; + u8_dec(x->x_buf, target); + //(*target)--; /* now go back until you find another space or the beginning of the buffer */ while (*target > 0 && x->x_buf[*target] != '\n' && x->x_buf[*target-1] != ' ') - (*target)--; + u8_dec(x->x_buf, target); + //(*target)--; if (x->x_buf[*target+1] == ' ') - (*target)++; + u8_inc(x->x_buf, target); + //(*target)++; if (x->x_selstart > x->x_selend) { swap = x->x_selend; x->x_selend = x->x_selstart; @@ -637,17 +713,21 @@ be printable in whatever 8-bit character set we find ourselves. 
*/ } /* now go forward until you find another space or the end of the buffer */ if (*target < x->x_bufsize - 1) - (*target)++; + u8_inc(x->x_buf, target); + //(*target)++; while (*target < x->x_bufsize && x->x_buf[*target] != '\n' && x->x_buf[*target] != ' ') - (*target)++; + u8_inc(x->x_buf, target); + //(*target)++; /* now skip all the spaces and land before next word */ while (*target < x->x_bufsize && x->x_buf[*target] == ' ') - (*target)++; + u8_inc(x->x_buf, target); + //(*target)++; if (*target > 0 && x->x_buf[*target-1] == ' ') - (*target)--; + u8_dec(x->x_buf, target); + //(*target)--; if (x->x_selstart > x->x_selend) { swap = x->x_selend; x->x_selend = x->x_selstart; diff --git a/pd/src/makefile.in b/pd/src/makefile.in index 963eeaf27..f289e88dd 100644 --- a/pd/src/makefile.in +++ b/pd/src/makefile.in @@ -66,7 +66,8 @@ OPT_SAFE_SRC = g_canvas.c g_graph.c g_text.c g_rtext.c g_array.c g_template.c g_ m_pd.c m_class.c m_obj.c m_atom.c m_memory.c m_binbuf.c \ m_conf.c m_glob.c m_sched.c \ s_main.c s_inter.c s_file.c s_print.c \ - s_loader.c s_path.c s_entry.c s_audio.c s_midi.c\ + s_loader.c s_path.c s_entry.c s_audio.c s_midi.c \ + s_utf8.c \ d_ugen.c d_arithmetic.c d_dac.c d_misc.c \ d_fft.c d_global.c \ d_resample.c \ diff --git a/pd/src/pd.tk b/pd/src/pd.tk index 10f839b76..4d529aa37 100644 --- a/pd/src/pd.tk +++ b/pd/src/pd.tk @@ -18,6 +18,11 @@ #http://ico.bukvic.net <ico@vt.edu> #puts stderr [info tclversion] +##--moo: force default system and stdio encoding to UTF-8 +encoding system utf-8 +fconfigure stderr -encoding utf-8 +fconfigure stdout -encoding utf-8 +##--/moo if { [info tclversion] >= 8.5 } { @@ -5947,9 +5952,9 @@ proc pdtk_text_new {canvasname myname x y text font color} { # 30 { set typeface [lindex $pd_fontlist 8] } # 36 { set typeface [lindex $pd_fontlist 9] } # } - + #[encoding convertfrom utf-8 $text] $canvasname create text $x $y -font [get_font_for_size $font] \ - -tags $myname -text $text -fill $color -anchor nw + -tags $myname -text 
[encoding convertfrom utf-8 $text] -fill $color -anchor nw $canvasname addtag text withtag $myname #$canvasname bind $myname <Home> [concat $canvasname icursor $myname 0] #$canvasname bind $myname <End> [concat $canvasname icursor $myname end] @@ -5964,7 +5969,7 @@ proc pdtk_text_new {canvasname myname x y text font color} { ################ pdtk_text_set -- change the text ################## proc pdtk_text_set {canvasname myname text} { - catch {$canvasname itemconfig $myname -text $text} + catch {$canvasname itemconfig $myname -text [encoding convertfrom utf-8 $text]} # pd [concat $myname size [$canvasname bbox $myname] \;] } diff --git a/pd/src/s_utf8.c b/pd/src/s_utf8.c new file mode 100644 index 000000000..a4179e7aa --- /dev/null +++ b/pd/src/s_utf8.c @@ -0,0 +1,310 @@ +/* + Basic UTF-8 manipulation routines + by Jeff Bezanson + placed in the public domain Fall 2005 + + This code is designed to provide the utilities you need to manipulate + UTF-8 as an internal string encoding. These functions do not perform the + error checking normally needed when handling UTF-8 data, so if you happen + to be from the Unicode Consortium you will want to flay me alive. + I do this because error checking can be performed at the boundaries (I/O), + with these routines reserved for higher performance on data known to be + valid. 
+ + modified by Bryan Jurish (moo) March 2009 + + removed some unneeded functions (escapes, printf etc), added others +*/ +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <stdarg.h> +#ifdef WIN32 +#include <malloc.h> +#else +#include <alloca.h> +#endif + +#include "s_utf8.h" + +static const u_int32_t offsetsFromUTF8[6] = { + 0x00000000UL, 0x00003080UL, 0x000E2080UL, + 0x03C82080UL, 0xFA082080UL, 0x82082080UL +}; + +static const char trailingBytesForUTF8[256] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 +}; + + +/* returns length of next utf-8 sequence */ +int u8_seqlen(char *s) +{ + return trailingBytesForUTF8[(unsigned int)(unsigned char)s[0]] + 1; +} + +/* conversions without error checking + only works for valid UTF-8, i.e. no 5- or 6-byte sequences + srcsz = source size in bytes, or -1 if 0-terminated + sz = dest size in # of wide characters + + returns # characters converted + dest will always be L'\0'-terminated, even if there isn't enough room + for all the characters. + if sz = srcsz+1 (i.e. 4*srcsz+4 bytes), there will always be enough space. 
+*/ +int u8_toucs(u_int32_t *dest, int sz, char *src, int srcsz) +{ + u_int32_t ch; + char *src_end = src + srcsz; + int nb; + int i=0; + + while (i < sz-1) { + nb = trailingBytesForUTF8[(unsigned char)*src]; + if (srcsz == -1) { + if (*src == 0) + goto done_toucs; + } + else { + if (src + nb >= src_end) + goto done_toucs; + } + ch = 0; + switch (nb) { + /* these fall through deliberately */ +#if UTF8_SUPPORT_FULL_UCS4 + case 5: ch += (unsigned char)*src++; ch <<= 6; + case 4: ch += (unsigned char)*src++; ch <<= 6; +#endif + case 3: ch += (unsigned char)*src++; ch <<= 6; + case 2: ch += (unsigned char)*src++; ch <<= 6; + case 1: ch += (unsigned char)*src++; ch <<= 6; + case 0: ch += (unsigned char)*src++; + } + ch -= offsetsFromUTF8[nb]; + dest[i++] = ch; + } + done_toucs: + dest[i] = 0; + return i; +} + +/* srcsz = number of source characters, or -1 if 0-terminated + sz = size of dest buffer in bytes + + returns # characters converted + dest will only be '\0'-terminated if there is enough space. this is + for consistency; imagine there are 2 bytes of space left, but the next + character requires 3 bytes. in this case we could NUL-terminate, but in + general we can't when there's insufficient space. therefore this function + only NUL-terminates if all the characters fit, and there's space for + the NUL as well. + the destination string will never be bigger than the source string. +*/ +int u8_toutf8(char *dest, int sz, u_int32_t *src, int srcsz) +{ + u_int32_t ch; + int i = 0; + char *dest_end = dest + sz; + + while (srcsz<0 ? 
src[i]!=0 : i < srcsz) { + ch = src[i]; + if (ch < 0x80) { + if (dest >= dest_end) + return i; + *dest++ = (char)ch; + } + else if (ch < 0x800) { + if (dest >= dest_end-1) + return i; + *dest++ = (ch>>6) | 0xC0; + *dest++ = (ch & 0x3F) | 0x80; + } + else if (ch < 0x10000) { + if (dest >= dest_end-2) + return i; + *dest++ = (ch>>12) | 0xE0; + *dest++ = ((ch>>6) & 0x3F) | 0x80; + *dest++ = (ch & 0x3F) | 0x80; + } + else if (ch < 0x110000) { + if (dest >= dest_end-3) + return i; + *dest++ = (ch>>18) | 0xF0; + *dest++ = ((ch>>12) & 0x3F) | 0x80; + *dest++ = ((ch>>6) & 0x3F) | 0x80; + *dest++ = (ch & 0x3F) | 0x80; + } + i++; + } + if (dest < dest_end) + *dest = '\0'; + return i; +} + +/* moo: get byte length of character number, or 0 if not supported */ +int u8_wc_nbytes(u_int32_t ch) +{ + if (ch < 0x80) return 1; + if (ch < 0x800) return 2; + if (ch < 0x10000) return 3; + if (ch < 0x110000) return 4; /* U+10FFFF is the last codepoint u8_wc_toutf8() can encode */ +#if UTF8_SUPPORT_FULL_UCS4 + /*-- moo: support full UCS-4 range? --*/ + if (ch < 0x4000000) return (ch < 0x200000) ? 4 : 5; + if (ch < 0x7fffffffUL) return 6; +#endif + return 0; /*-- bad input --*/ +} + +int u8_wc_toutf8(char *dest, u_int32_t ch) +{ + if (ch < 0x80) { + dest[0] = (char)ch; + return 1; + } + if (ch < 0x800) { + dest[0] = (ch>>6) | 0xC0; + dest[1] = (ch & 0x3F) | 0x80; + return 2; + } + if (ch < 0x10000) { + dest[0] = (ch>>12) | 0xE0; + dest[1] = ((ch>>6) & 0x3F) | 0x80; + dest[2] = (ch & 0x3F) | 0x80; + return 3; + } + if (ch < 0x110000) { + dest[0] = (ch>>18) | 0xF0; + dest[1] = ((ch>>12) & 0x3F) | 0x80; + dest[2] = ((ch>>6) & 0x3F) | 0x80; + dest[3] = (ch & 0x3F) | 0x80; + return 4; + } + return 0; +} + +/*-- moo --*/ +int u8_wc_toutf8_nul(char *dest, u_int32_t ch) +{ + int sz = u8_wc_toutf8(dest,ch); + dest[sz] = '\0'; + return sz; +} + +/* charnum => byte offset */ +int u8_offset(char *str, int charnum) +{ + char *string = str; + + while (charnum > 0 && *string != '\0') { + if (*string++ & 0x80) { + if (!isutf(*string)) { + ++string; + if (!isutf(*string)) { + ++string; + 
if (!isutf(*string)) { + ++string; + } + } + } + } + --charnum; + } + + return (int)(string - str); +} + +/* byte offset => charnum */ +int u8_charnum(char *s, int offset) +{ + int charnum = 0; + char *string = s; + char *const end = string + offset; + + while (string < end && *string != '\0') { + if (*string++ & 0x80) { + if (!isutf(*string)) { + ++string; + if (!isutf(*string)) { + ++string; + if (!isutf(*string)) { + ++string; + } + } + } + } + ++charnum; + } + return charnum; +} + +/* reads the next utf-8 sequence out of a string, updating an index */ +u_int32_t u8_nextchar(char *s, int *i) +{ + u_int32_t ch = 0; + int sz = 0; + + do { + ch <<= 6; + ch += (unsigned char)s[(*i)++]; + sz++; + } while (s[*i] && !isutf(s[*i])); + ch -= offsetsFromUTF8[sz-1]; + + return ch; +} + +/* number of characters */ +int u8_strlen(char *s) +{ + int count = 0; + int i = 0; + + while (u8_nextchar(s, &i) != 0) + count++; + + return count; +} + +void u8_inc(char *s, int *i) +{ + if (s[(*i)++] & 0x80) { + if (!isutf(s[*i])) { + ++(*i); + if (!isutf(s[*i])) { + ++(*i); + if (!isutf(s[*i])) { + ++(*i); + } + } + } + } +} + +void u8_dec(char *s, int *i) +{ + (void)(isutf(s[--(*i)]) || isutf(s[--(*i)]) || + isutf(s[--(*i)]) || --(*i)); +} + +/*-- moo --*/ +void u8_inc_ptr(char **sp) +{ + (void)(isutf(*(++(*sp))) || isutf(*(++(*sp))) || + isutf(*(++(*sp))) || ++(*sp)); +} + +/*-- moo --*/ +void u8_dec_ptr(char **sp) +{ + (void)(isutf(*(--(*sp))) || isutf(*(--(*sp))) || + isutf(*(--(*sp))) || --(*sp)); +} diff --git a/pd/src/s_utf8.h b/pd/src/s_utf8.h new file mode 100644 index 000000000..56c40d480 --- /dev/null +++ b/pd/src/s_utf8.h @@ -0,0 +1,88 @@ +#ifndef S_UTF8_H +#define S_UTF8_H + +/*--moo--*/ +#ifndef u_int32_t +# define u_int32_t unsigned int +#endif + +#ifndef UCS4 +# define UCS4 u_int32_t +#endif + +/* UTF8_SUPPORT_FULL_UCS4 + * define this to support the full potential range of UCS-4 codepoints + * (in anticipation of a future UTF-8 standard) + */ +/*#define 
UTF8_SUPPORT_FULL_UCS4 1*/ +#undef UTF8_SUPPORT_FULL_UCS4 + +/* UTF8_MAXBYTES + * maximum number of bytes required to represent a single character in UTF-8 + * + * UTF8_MAXBYTES1 = UTF8_MAXBYTES+1 + * maximum bytes per character including NUL terminator + */ +#ifdef UTF8_SUPPORT_FULL_UCS4 +# ifndef UTF8_MAXBYTES +# define UTF8_MAXBYTES 6 +# endif +# ifndef UTF8_MAXBYTES1 +# define UTF8_MAXBYTES1 7 +# endif +#else +# ifndef UTF8_MAXBYTES +# define UTF8_MAXBYTES 4 +# endif +# ifndef UTF8_MAXBYTES1 +# define UTF8_MAXBYTES1 5 +# endif +#endif +/*--/moo--*/ + +/* is c the start of a utf8 sequence? */ +#define isutf(c) (((c)&0xC0)!=0x80) + +/* convert UTF-8 data to wide character */ +int u8_toucs(u_int32_t *dest, int sz, char *src, int srcsz); + +/* the opposite conversion */ +int u8_toutf8(char *dest, int sz, u_int32_t *src, int srcsz); + +/* moo: get byte length of character number, or 0 if not supported */ +int u8_wc_nbytes(u_int32_t ch); + +/* moo: compute required storage for UTF-8 encoding of 's[0..n-1]' */ +int u8_wcs_nbytes(u_int32_t *ucs, int size); + +/* single character to UTF-8, no NUL termination */ +int u8_wc_toutf8(char *dest, u_int32_t ch); + +/* moo: single character to UTF-8, with NUL termination */ +int u8_wc_toutf8_nul(char *dest, u_int32_t ch); + +/* character number to byte offset */ +int u8_offset(char *str, int charnum); + +/* byte offset to character number */ +int u8_charnum(char *s, int offset); + +/* return next character, updating an index variable */ +u_int32_t u8_nextchar(char *s, int *i); + +/* move to next character */ +void u8_inc(char *s, int *i); + +/* move to previous character */ +void u8_dec(char *s, int *i); + +/* moo: move pointer to next character */ +void u8_inc_ptr(char **sp); + +/* moo: move pointer to previous character */ +void u8_dec_ptr(char **sp); + +/* returns length of next utf-8 sequence */ +int u8_seqlen(char *s); + +#endif /* S_UTF8_H */ -- GitLab