From 23045bf4b28af5b9def688516d243cc9068ce848 Mon Sep 17 00:00:00 2001
From: Ivica Ico Bukvic <ico@vt.edu>
Date: Sat, 15 Dec 2012 16:33:19 -0500
Subject: [PATCH] backported utf-8 support (may require further testing)

---
 pd/src/g_editor.c  |   4 +
 pd/src/g_rtext.c   | 224 +++++++++++++++++++++-----------
 pd/src/makefile.in |   3 +-
 pd/src/pd.tk       |  11 +-
 pd/src/s_utf8.c    | 310 +++++++++++++++++++++++++++++++++++++++++++++
 pd/src/s_utf8.h    |  88 +++++++++++++
 6 files changed, 564 insertions(+), 76 deletions(-)
 create mode 100644 pd/src/s_utf8.c
 create mode 100644 pd/src/s_utf8.h

diff --git a/pd/src/g_editor.c b/pd/src/g_editor.c
index f4c0457fd..5c23a7424 100644
--- a/pd/src/g_editor.c
+++ b/pd/src/g_editor.c
@@ -9,6 +9,7 @@
 #include "s_stuff.h"
 #include "g_magicglass.h"
 #include "g_canvas.h"
+#include "s_utf8.h" /*-- moo --*/
 #include "g_undo.h"
 #include "x_preset.h"
 #include <string.h>
@@ -3490,6 +3491,9 @@ void canvas_key(t_canvas *x, t_symbol *s, int ac, t_atom *av)
         case 127:gotkeysym = gensym("Delete"); break;
         default:
             sprintf(buf, "%c", (int)(av[1].a_w.w_float));
+			/*-- moo: assume keynum is a Unicode codepoint; encode as UTF-8 --*/
+			char buf[UTF8_MAXBYTES1];
+			u8_wc_toutf8_nul(buf, (UCS4)(av[1].a_w.w_float));
             gotkeysym = gensym(buf);
         }
     }
diff --git a/pd/src/g_rtext.c b/pd/src/g_rtext.c
index 42daac36d..cf8ff2bd3 100644
--- a/pd/src/g_rtext.c
+++ b/pd/src/g_rtext.c
@@ -14,6 +14,7 @@
 #include "m_imp.h"
 #include "s_stuff.h"
 #include "g_canvas.h"
+#include "s_utf8.h"
 #include "t_tk.h"
 
 #define LMARGIN 2
@@ -39,10 +40,10 @@ static int last_sel = 0;
 
 struct _rtext
 {
-    char *x_buf;
-    int x_bufsize;
-    int x_selstart;
-    int x_selend;
+    char *x_buf;    /*-- raw byte string, assumed UTF-8 encoded (moo) --*/
+    int x_bufsize;  /*-- byte length --*/
+    int x_selstart; /*-- byte offset --*/
+    int x_selend;   /*-- byte offset --*/
     int x_active;
     int x_dragfrom;
     int x_height;
@@ -111,8 +112,30 @@ void rtext_getseltext(t_rtext *x, char **buf, int *bufsize)
     *bufsize = x->x_selend - x->x_selstart;
 }
 
+/* pick the Tk tag name ("text", "obj", "msg" or "atom") that matches the
+   te_type of the box owning this rtext; any other te_type yields &s_ */
+static t_symbol *rtext_gettype(t_rtext *x)
+{
+    char *tagname = 0;
+    int boxtype = x->x_text->te_type;
+    if (boxtype == T_TEXT) tagname = "text";
+    else if (boxtype == T_OBJECT) tagname = "obj";
+    else if (boxtype == T_MESSAGE) tagname = "msg";
+    else if (boxtype == T_ATOM) tagname = "atom";
+    return (tagname ? gensym(tagname) : &s_);
+}
+
 /* LATER deal with tcl-significant characters */
 
+/* firstone(), lastone()
+ *  + returns byte offset of (first|last) occurrence of 'c' in 's[0..n-1]', or
+ *    -1 if none was found
+ *  + 's' is a raw byte string
+ *  + 'c' is a byte value
+ *  + 'n' is the length (in bytes) of the prefix of 's' to be searched.
+ *  + we could make these functions work on logical characters in utf8 strings,
+ *    but we don't really need to...
+ */
 static int firstone(char *s, int c, int n)
 {
     char *s2 = s + n;
@@ -149,6 +172,16 @@ static int lastone(char *s, int c, int n)
     of the entire text in pixels.
     */
 
+   /*-- moo: 
+    * + some variables from the original version have been renamed
+    * + variables with a "_b" suffix are raw byte strings, lengths, or offsets
+    * + variables with a "_c" suffix are logical character lengths or offsets
+    *   (assuming valid UTF-8 encoded byte string in x->x_buf)
+    * + a fair amount of O(n) computations required to convert between raw byte
+    *   offsets (needed by the C side) and logical character offsets (needed by
+    *   the GUI)
+    */
+
     /* LATER get this and sys_vgui to work together properly,
         breaking up messages as needed.  As of now, there's
         a limit of 1950 characters, imposed by sys_vgui(). */
@@ -167,14 +200,17 @@ static void rtext_senditup(t_rtext *x, int action, int *widthp, int *heightp,
 	if (x) {
 		t_float dispx, dispy;
 		char smallbuf[200], *tempbuf;
-		int outchars = 0, nlines = 0, ncolumns = 0,
+		int outchars_b = 0, nlines = 0, ncolumns = 0,
 		    pixwide, pixhigh, font, fontwidth, fontheight, findx, findy;
 		int reportedindex = 0;
 		t_canvas *canvas = glist_getcanvas(x->x_glist);
-		int widthspec = x->x_text->te_width;
-		int widthlimit = (widthspec ? widthspec : BOXWIDTH);
-		int inindex = 0;
-		int selstart = 0, selend = 0;
+
+		int widthspec_c = x->x_text->te_width;
+		int widthlimit_c = (widthspec_c ? widthspec_c : BOXWIDTH);
+		int inindex_b = 0;
+		int inindex_c = 0;
+		int selstart_b = 0, selend_b = 0;
+		int x_bufsize_c = u8_charnum(x->x_buf, x->x_bufsize);
 		    /* if we're a GOP (the new, "goprect" style) borrow the font size
 		    from the inside to preserve the spacing */
 		if (pd_class(&x->x_text->te_pd) == canvas_class &&
@@ -189,74 +225,85 @@ static void rtext_senditup(t_rtext *x, int action, int *widthp, int *heightp,
 		if (x->x_bufsize >= 100)
 		     tempbuf = (char *)t_getbytes(2 * x->x_bufsize + 1);
 		else tempbuf = smallbuf;
-		while (x->x_bufsize - inindex > 0)
+		while (x_bufsize_c - inindex_c > 0)
 		{
-		    int inchars = x->x_bufsize - inindex;
-		    int maxindex = (inchars > widthlimit ? widthlimit : inchars);
+			int inchars_b  = x->x_bufsize - inindex_b;
+			int inchars_c  = x_bufsize_c  - inindex_c;
+			int maxindex_c = (inchars_c > widthlimit_c ? widthlimit_c : inchars_c);
+			int maxindex_b = u8_offset(x->x_buf + inindex_b, maxindex_c);
 		    int eatchar = 1;
-		    int foundit = firstone(x->x_buf + inindex, '\n', maxindex);
-		    if (foundit < 0)
+			int foundit_b  = firstone(x->x_buf + inindex_b, '\n', maxindex_b);
+			int foundit_c;
+			if (foundit_b < 0)
 		    {
-		        if (inchars > widthlimit)
+		        if (inchars_c > widthlimit_c)
 		        {
-		            foundit = lastone(x->x_buf + inindex, ' ', maxindex);
-		            if (foundit < 0)
+					foundit_b = lastone(x->x_buf + inindex_b, ' ', maxindex_b);
+					if (foundit_b < 0)
 		            {
-		                foundit = maxindex;
+						foundit_b = maxindex_b;
+						foundit_c = maxindex_c;
 		                eatchar = 0;
 		            }
+					else
+						foundit_c = u8_charnum(x->x_buf + inindex_b, foundit_b);
 		        }
 		        else
 		        {
-		            foundit = inchars;
+					foundit_b = inchars_b;
+					foundit_c = inchars_c;
 		            eatchar = 0;
 		        }
 		    }
+			else
+				foundit_c = u8_charnum(x->x_buf + inindex_b, foundit_b);
+
 		    if (nlines == findy)
 		    {
 		        int actualx = (findx < 0 ? 0 :
-		            (findx > foundit ? foundit : findx));
-		        *indexp = inindex + actualx;
+		            (findx > foundit_c ? foundit_c : findx));
+		        *indexp = inindex_b + u8_offset(x->x_buf + inindex_b, actualx);
 		        reportedindex = 1;
 		    }
-		    strncpy(tempbuf+outchars, x->x_buf + inindex, foundit);
-		    if (x->x_selstart >= inindex &&
-		        x->x_selstart <= inindex + foundit + eatchar)
-		            selstart = x->x_selstart + outchars - inindex;
-		    if (x->x_selend >= inindex &&
-		        x->x_selend <= inindex + foundit + eatchar)
-		            selend = x->x_selend + outchars - inindex;
-		    outchars += foundit;
-		    inindex += (foundit + eatchar);
-		    if (inindex < x->x_bufsize)
-		        tempbuf[outchars++] = '\n';
-		    if (foundit > ncolumns)
-		        ncolumns = foundit;
+		    strncpy(tempbuf+outchars_b, x->x_buf + inindex_b, foundit_b);
+		    if (x->x_selstart >= inindex_b &&
+		        x->x_selstart <= inindex_b + foundit_b + eatchar)
+		            selstart_b = x->x_selstart + outchars_b - inindex_b;
+		    if (x->x_selend >= inindex_b &&
+		        x->x_selend <= inindex_b + foundit_b + eatchar)
+		            selend_b = x->x_selend + outchars_b - inindex_b;
+		    outchars_b += foundit_b;
+		    inindex_b += (foundit_b + eatchar);
+		    inindex_c += (foundit_c + eatchar);
+		    if (inindex_b < x->x_bufsize)
+		        tempbuf[outchars_b++] = '\n';
+		    if (foundit_c > ncolumns)
+		        ncolumns = foundit_c;
 		    nlines++;
 		}
 		if (!reportedindex)
-		    *indexp = outchars;
+		    *indexp = outchars_b;
 		dispx = text_xpix(x->x_text, x->x_glist);
 		dispy = text_ypix(x->x_text, x->x_glist);
 		if (nlines < 1) nlines = 1;
-		if (!widthspec)
+		if (!widthspec_c)
 		{
 		    while (ncolumns < 3)
 		    {
-		        tempbuf[outchars++] = ' ';
+		        tempbuf[outchars_b++] = ' ';
 		        ncolumns++;
 		    }
 		}
-		else ncolumns = widthspec;
+		else ncolumns = widthspec_c;
 		pixwide = ncolumns * fontwidth + (LMARGIN + RMARGIN);
 		pixhigh = nlines * fontheight + (TMARGIN + BMARGIN);
 
 		if (action == SEND_FIRST) {
 			//fprintf(stderr,"canvas=.x%lx %s\n", (t_int)canvas, tempbuf);
-		    sys_vgui("pdtk_text_new .x%lx.c %s %f %f {%.*s} %d %s\n",
-		        canvas, x->x_tag,
+		    sys_vgui("pdtk_text_new .x%lx.c {%s %s text} %f %f {%.*s} %d %s\n",
+		        canvas, x->x_tag, rtext_gettype(x)->s_name,
 		        dispx + LMARGIN, dispy + TMARGIN,
-		        outchars, tempbuf, sys_hostfontsize(font),
+		        outchars_b, tempbuf, sys_hostfontsize(font),
 		        (glist_isselected(x->x_glist,
 		            &x->x_glist->gl_gobj)? "$select_color" : "$text_color"));
 		}
@@ -267,7 +314,7 @@ static void rtext_senditup(t_rtext *x, int action, int *widthp, int *heightp,
 				((t_glist *)(x->x_text))->gl_isgraph,
 				((t_glist *)(x->x_text))->gl_goprect );*/
 		    sys_vgui("pdtk_text_set .x%lx.c %s {%.*s}\n",
-		        canvas, x->x_tag, outchars, tempbuf);
+		        canvas, x->x_tag, outchars_b, tempbuf);
 			/*if ( pd_class(&x->x_text->te_pd) == canvas_class &&
 		    	((t_glist *)(x->x_text))->gl_isgraph &&
 		    	(((t_glist *)(x->x_text))->gl_goprect) ) {
@@ -279,19 +326,20 @@ static void rtext_senditup(t_rtext *x, int action, int *widthp, int *heightp,
 		            pixwide, pixhigh, 0);
 		    if (x->x_active)
 		    {
-		        if (selend > selstart)
+		        if (selend_b > selstart_b)
 		        {
 		            sys_vgui(".x%lx.c select from %s %d\n", canvas, 
-		                x->x_tag, selstart);
+		                x->x_tag, u8_charnum(x->x_buf, selstart_b));
 		            sys_vgui(".x%lx.c select to %s %d\n", canvas, 
-		                x->x_tag, selend + (sys_oldtclversion ? 0 : -1));
+		                x->x_tag, u8_charnum(x->x_buf, selend_b)
+					  	+ (sys_oldtclversion ? 0 : -1));
 		            sys_vgui(".x%lx.c focus \"\"\n", canvas);        
 		        }
 		        else
 		        {
 		            sys_vgui(".x%lx.c select clear\n", canvas);
 		            sys_vgui(".x%lx.c icursor %s %d\n", canvas, x->x_tag,
-		                selstart);
+		                u8_charnum(x->x_buf, selstart_b));
 		            sys_vgui(".x%lx.c focus %s\n", canvas, x->x_tag);        
 		        }
 		    }
@@ -467,7 +515,7 @@ void rtext_key(t_rtext *x, int keynum, t_symbol *keysym)
                 ....
             } */
             if (x->x_selstart && (x->x_selstart == x->x_selend)) {
-                x->x_selstart--;
+                u8_dec(x->x_buf, &x->x_selstart);
 				if (glist_isvisible(glist_getcanvas(x->x_glist)))
 					sys_vgui("pdtk_canvas_getscroll .x%lx.c\n", (t_int)glist_getcanvas(x->x_glist));
 			}
@@ -476,7 +524,7 @@ void rtext_key(t_rtext *x, int keynum, t_symbol *keysym)
         else if (n == 127)      /* delete */
         {
             if (x->x_selend < x->x_bufsize && (x->x_selstart == x->x_selend))
-                x->x_selend++;
+                u8_inc(x->x_buf, &x->x_selend);
 			if (glist_isvisible(glist_getcanvas(x->x_glist)))
 				sys_vgui("pdtk_canvas_getscroll .x%lx.c\n", (t_int)glist_getcanvas(x->x_glist));
         }
@@ -491,7 +539,13 @@ void rtext_key(t_rtext *x, int keynum, t_symbol *keysym)
 /* at Guenter's suggestion, use 'n>31' to test wither a character might
 be printable in whatever 8-bit character set we find ourselves. */
 
-        if (n == '\n' || (n > 31 && n != 127))
+/*-- moo:
+  ... but test with "<" rather than "!=" in order to accommodate unicode
+  codepoints for n (which we get since Tk is sending the "%A" substitution
+  for bind <Key>), effectively reducing the coverage of this clause to 7
+  bits.  Case n>127 is covered by the next clause.
+*/
+        if (n == '\n' || (n > 31 && n < 127))
         {
             newsize = x->x_bufsize+1;
             x->x_buf = resizebytes(x->x_buf, x->x_bufsize, newsize);
@@ -502,6 +556,19 @@ be printable in whatever 8-bit character set we find ourselves. */
             x->x_selstart = x->x_selstart + 1;
 			if (glist_isvisible(glist_getcanvas(x->x_glist)))
 				sys_vgui("pdtk_canvas_getscroll .x%lx.c\n", (t_int)glist_getcanvas(x->x_glist));
+        }
+		/*--moo: check for unicode codepoints beyond 7-bit ASCII --*/
+		else if (n > 127)
+		{
+            int ch_nbytes = u8_wc_nbytes(n);
+            newsize = x->x_bufsize + ch_nbytes;
+            x->x_buf = resizebytes(x->x_buf, x->x_bufsize, newsize);
+            for (i = newsize - 1; i >= x->x_selstart + ch_nbytes; i--)
+                x->x_buf[i] = x->x_buf[i - ch_nbytes]; /* open a ch_nbytes-wide gap at the caret */
+            x->x_bufsize = newsize;
+            /*-- moo: assume canvas_key() has encoded keysym as UTF-8 */
+            strncpy(x->x_buf+x->x_selstart, keysym->s_name, ch_nbytes);
+            x->x_selstart = x->x_selstart + ch_nbytes;
         }
         x->x_selend = x->x_selstart;
         x->x_glist->gl_editor->e_textdirty = 1;
@@ -509,7 +576,10 @@ be printable in whatever 8-bit character set we find ourselves. */
     else if (!strcmp(keysym->s_name, "Right"))
     {
         if (x->x_selend == x->x_selstart && x->x_selstart < x->x_bufsize)
-            x->x_selend = x->x_selstart = x->x_selstart + 1;
+        {
+            u8_inc(x->x_buf, &x->x_selstart);
+            x->x_selend = x->x_selstart;
+        }
         else
             x->x_selstart = x->x_selend;
 		last_sel = 0;		
@@ -517,7 +587,10 @@ be printable in whatever 8-bit character set we find ourselves. */
     else if (!strcmp(keysym->s_name, "Left"))
     {
         if (x->x_selend == x->x_selstart && x->x_selstart > 0)
-            x->x_selend = x->x_selstart = x->x_selstart - 1;
+        {
+            u8_dec(x->x_buf, &x->x_selstart);
+            x->x_selend = x->x_selstart;
+        }
         else
             x->x_selend = x->x_selstart;
 		last_sel = 0;
@@ -527,11 +600,11 @@ be printable in whatever 8-bit character set we find ourselves. */
 		if (!last_sel) last_sel = 2;
 		if (last_sel == 1 && x->x_selstart < x->x_selend) {
 		    if (x->x_selstart < x->x_bufsize)
-		        x->x_selstart =  x->x_selstart + 1;			
+		        u8_inc(x->x_buf, &x->x_selstart);		
 		} else {
 			last_sel = 2;
 		    if (x->x_selend < x->x_bufsize)
-		        x->x_selend =  x->x_selend + 1;
+		        u8_inc(x->x_buf, &x->x_selend);
 		}
     }
     else if (!strcmp(keysym->s_name, "ShiftLeft"))
@@ -542,16 +615,16 @@ be printable in whatever 8-bit character set we find ourselves. */
 		} else {
 			last_sel = 1;
 		    if (x->x_selstart > 0)
-		        x->x_selstart = x->x_selstart - 1;
+		        u8_dec(x->x_buf, &x->x_selstart);
 		}
     }
         /* this should be improved...  life's too short */
     else if (!strcmp(keysym->s_name, "Up") || !strcmp(keysym->s_name, "Home"))
     {
         if (x->x_selstart)
-            x->x_selstart--;
+            u8_dec(x->x_buf, &x->x_selstart);
         while (x->x_selstart > 0 && x->x_buf[x->x_selstart] != '\n')
-            x->x_selstart--;
+            u8_dec(x->x_buf, &x->x_selstart);
         x->x_selend = x->x_selstart;
 		last_sel = 0;
     }
@@ -559,9 +632,9 @@ be printable in whatever 8-bit character set we find ourselves. */
     {
         while (x->x_selend < x->x_bufsize &&
             x->x_buf[x->x_selend] != '\n')
-            x->x_selend++;
+            u8_inc(x->x_buf, &x->x_selend);
         if (x->x_selend < x->x_bufsize)
-            x->x_selend++;
+            u8_inc(x->x_buf, &x->x_selend);
         x->x_selstart = x->x_selend;
 		last_sel = 0;
     }
@@ -569,31 +642,31 @@ be printable in whatever 8-bit character set we find ourselves. */
     {
 		/* first find first non-space char going back */
 		while (x->x_selstart > 0 && x->x_buf[x->x_selstart-1] == ' ')
-			x->x_selstart--;
+			u8_dec(x->x_buf, &x->x_selstart);
 		/* now go back until you find another space or the beginning of the buffer */
         while (x->x_selstart > 0 &&
 		  x->x_buf[x->x_selstart] != '\n' &&
 		  x->x_buf[x->x_selstart-1] != ' ')
-            x->x_selstart--;
+            u8_dec(x->x_buf, &x->x_selstart);
 		if (x->x_buf[x->x_selstart+1] == ' ')
-			x->x_selstart++;
+			u8_inc(x->x_buf, &x->x_selstart);
 		x->x_selend = x->x_selstart;
     }
     else if (!strcmp(keysym->s_name, "CtrlRight"))
     {
 		/* now go forward until you find another space or the end of the buffer */
 		if (x->x_selend < x->x_bufsize - 1)
-			x->x_selend++;
+			u8_inc(x->x_buf, &x->x_selend);
         while (x->x_selend < x->x_bufsize &&
           x->x_buf[x->x_selend] != '\n' &&
 		  x->x_buf[x->x_selend] != ' ')
-            x->x_selend++;
+            u8_inc(x->x_buf, &x->x_selend);
 		/* now skip all the spaces and land before next word */
         while (x->x_selend < x->x_bufsize &&
 		  x->x_buf[x->x_selend] == ' ')
-            x->x_selend++;
+            u8_inc(x->x_buf, &x->x_selend);
 		if (x->x_selend > 0 && x->x_buf[x->x_selend-1] == ' ')
-			x->x_selend--;
+			u8_dec(x->x_buf, &x->x_selend);
 		x->x_selstart = x->x_selend;
     }
     else if (!strcmp(keysym->s_name, "CtrlShiftLeft"))
@@ -609,14 +682,17 @@ be printable in whatever 8-bit character set we find ourselves. */
 		}
 		/* first find first non-space char going back */
 		while (*target > 0 && x->x_buf[*target-1] == ' ')
-			(*target)--;
+			u8_dec(x->x_buf, target);
+			//(*target)--;
 		/* now go back until you find another space or the beginning of the buffer */
         while (*target > 0 &&
 		  x->x_buf[*target] != '\n' &&
 		  x->x_buf[*target-1] != ' ')
-            (*target)--;
+			u8_dec(x->x_buf, target);
+            //(*target)--;
 		if (x->x_buf[*target+1] == ' ')
-			(*target)++;
+			u8_inc(x->x_buf, target);
+			//(*target)++;
         if (x->x_selstart > x->x_selend) {
 			swap = x->x_selend;
 			x->x_selend = x->x_selstart;
@@ -637,17 +713,21 @@ be printable in whatever 8-bit character set we find ourselves. */
 		}
 		/* now go forward until you find another space or the end of the buffer */
 		if (*target < x->x_bufsize - 1)
-			(*target)++;
+			u8_inc(x->x_buf, target);
+			//(*target)++;
         while (*target < x->x_bufsize &&
           x->x_buf[*target] != '\n' &&
 		  x->x_buf[*target] != ' ')
-            (*target)++;
+			u8_inc(x->x_buf, target);
+            //(*target)++;
 		/* now skip all the spaces and land before next word */
         while (*target < x->x_bufsize &&
 		  x->x_buf[*target] == ' ')
-            (*target)++;
+			u8_inc(x->x_buf, target);
+            //(*target)++;
 		if (*target > 0 && x->x_buf[*target-1] == ' ')
-			(*target)--;
+			u8_dec(x->x_buf, target);
+			//(*target)--;
         if (x->x_selstart > x->x_selend) {
 			swap = x->x_selend;
 			x->x_selend = x->x_selstart;
diff --git a/pd/src/makefile.in b/pd/src/makefile.in
index 963eeaf27..f289e88dd 100644
--- a/pd/src/makefile.in
+++ b/pd/src/makefile.in
@@ -66,7 +66,8 @@ OPT_SAFE_SRC = g_canvas.c g_graph.c g_text.c g_rtext.c g_array.c g_template.c g_
     m_pd.c m_class.c m_obj.c m_atom.c m_memory.c m_binbuf.c \
     m_conf.c m_glob.c m_sched.c \
     s_main.c s_inter.c s_file.c s_print.c \
-    s_loader.c s_path.c s_entry.c s_audio.c s_midi.c\
+    s_loader.c s_path.c s_entry.c s_audio.c s_midi.c \
+	s_utf8.c \
     d_ugen.c d_arithmetic.c d_dac.c d_misc.c \
     d_fft.c d_global.c \
     d_resample.c \
diff --git a/pd/src/pd.tk b/pd/src/pd.tk
index 10f839b76..4d529aa37 100644
--- a/pd/src/pd.tk
+++ b/pd/src/pd.tk
@@ -18,6 +18,11 @@
 #http://ico.bukvic.net <ico@vt.edu>
 
 #puts stderr [info tclversion]
+##--moo: force default system and stdio encoding to UTF-8
+encoding system utf-8
+fconfigure stderr -encoding utf-8
+fconfigure stdout -encoding utf-8
+##--/moo
 
 if { [info tclversion] >= 8.5 } {
 
@@ -5947,9 +5952,9 @@ proc pdtk_text_new {canvasname myname x y text font color} {
 #        30 { set typeface [lindex $pd_fontlist 8] }
 #        36 { set typeface [lindex $pd_fontlist 9] }
 #    }
-
+	#[encoding convertfrom utf-8 $text]
     $canvasname create text $x $y -font [get_font_for_size $font] \
-        -tags $myname -text $text -fill $color -anchor nw
+        -tags $myname -text [encoding convertfrom utf-8 $text] -fill $color -anchor nw
 	$canvasname addtag text withtag $myname
 	#$canvasname bind $myname <Home> [concat $canvasname icursor $myname 0]
 	#$canvasname bind $myname <End> [concat $canvasname icursor $myname end]
@@ -5964,7 +5969,7 @@ proc pdtk_text_new {canvasname myname x y text font color} {
 
 ################ pdtk_text_set -- change the text ##################
 proc pdtk_text_set {canvasname myname text} {
-    catch {$canvasname itemconfig $myname -text $text}
+    catch {$canvasname itemconfig $myname -text [encoding convertfrom utf-8 $text]}
     #    pd [concat $myname size [$canvasname bbox $myname] \;]
 }
 
diff --git a/pd/src/s_utf8.c b/pd/src/s_utf8.c
new file mode 100644
index 000000000..a4179e7aa
--- /dev/null
+++ b/pd/src/s_utf8.c
@@ -0,0 +1,310 @@
+/*
+  Basic UTF-8 manipulation routines
+  by Jeff Bezanson
+  placed in the public domain Fall 2005
+
+  This code is designed to provide the utilities you need to manipulate
+  UTF-8 as an internal string encoding. These functions do not perform the
+  error checking normally needed when handling UTF-8 data, so if you happen
+  to be from the Unicode Consortium you will want to flay me alive.
+  I do this because error checking can be performed at the boundaries (I/O),
+  with these routines reserved for higher performance on data known to be
+  valid.
+
+  modified by Bryan Jurish (moo) March 2009
+  + removed some unneeded functions (escapes, printf etc), added others
+*/
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdarg.h>
+#ifdef WIN32
+#include <malloc.h>
+#else
+#include <alloca.h>
+#endif
+
+#include "s_utf8.h"
+
+static const u_int32_t offsetsFromUTF8[6] = { /* indexed by # of trailing bytes */
+    0x00000000UL, 0x00003080UL, 0x000E2080UL, /* value subtracted after the raw   */
+    0x03C82080UL, 0xFA082080UL, 0x82082080UL  /* bytes were summed in 6-bit steps */
+};
+
+static const char trailingBytesForUTF8[256] = { /* indexed by the first byte of a sequence */
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x00-0x1F */
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x20-0x3F */
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x40-0x5F */
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x60-0x7F */
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x9F */
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xA0-0xBF */
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xC0-0xDF */
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5  /* 0xE0-0xFF */
+};
+
+
+/* byte length of the sequence starting at s: lead byte + its trailing bytes */
+int u8_seqlen(char *s)
+{
+    return 1 + trailingBytesForUTF8[(unsigned char)*s];
+}
+
+/* conversions without error checking
+   only works for valid UTF-8, i.e. no 5- or 6-byte sequences
+   srcsz = source size in bytes, or -1 if 0-terminated
+   sz = dest size in # of wide characters; must be at least 1, because the
+        terminating 0 is stored unconditionally
+
+   returns # characters converted
+   dest will always be L'\0'-terminated, even if there isn't enough room
+   for all the characters.
+   if sz = srcsz+1 (i.e. 4*srcsz+4 bytes), there will always be enough space.
+*/
+int u8_toucs(u_int32_t *dest, int sz, char *src, int srcsz)
+{
+    u_int32_t ch;
+    /* only form a real end pointer for a counted source (srcsz >= 0) */
+    char *src_end = (srcsz < 0 ? src : src + srcsz);
+    int nb;
+    int i=0;
+
+    while (i < sz-1) {
+        /* test for end of input BEFORE looking at the lead byte */
+        if ((srcsz == -1) ? (*src == 0) : (src >= src_end))
+            goto done_toucs;
+        nb = trailingBytesForUTF8[(unsigned char)*src];
+        /* counted source: stop if this sequence would run off the end */
+        if (srcsz != -1 && src + nb >= src_end)
+            goto done_toucs;
+        ch = 0;
+        switch (nb) {
+            /* these fall through deliberately */
+#if UTF8_SUPPORT_FULL_UCS4
+        case 5: ch += (unsigned char)*src++; ch <<= 6;
+        case 4: ch += (unsigned char)*src++; ch <<= 6;
+#endif
+        case 3: ch += (unsigned char)*src++; ch <<= 6;
+        case 2: ch += (unsigned char)*src++; ch <<= 6;
+        case 1: ch += (unsigned char)*src++; ch <<= 6;
+        case 0: ch += (unsigned char)*src++;
+        }
+        ch -= offsetsFromUTF8[nb];
+        dest[i++] = ch;
+    }
+ done_toucs:
+    dest[i] = 0;
+    return i;
+}
+
+/* Encode UCS-4 codepoints as UTF-8.
+   dest/sz   = output buffer and its size in bytes
+   src/srcsz = input codepoints and their count; a negative srcsz means
+               "stop at the first 0 codepoint"
+   returns the number of source characters consumed; stops early, without
+   writing a partial sequence, when the next character does not fit.
+   dest gets a '\0' only when everything was converted and a byte is left.
+   codepoints >= 0x110000 produce no bytes but are still counted.
+*/
+int u8_toutf8(char *dest, int sz, u_int32_t *src, int srcsz)
+{
+    char *out = dest;
+    char *const limit = dest + sz;
+    int n = 0;
+
+    for (;; n++) {
+        u_int32_t cp;
+        int need;    /* encoded width of cp in bytes; 0 = nothing to write */
+
+        if (srcsz < 0 ? src[n] == 0 : n >= srcsz)
+            break;
+        cp = src[n];
+        if (cp < 0x80)           need = 1;
+        else if (cp < 0x800)     need = 2;
+        else if (cp < 0x10000)   need = 3;
+        else if (cp < 0x110000)  need = 4;
+        else                     need = 0;
+        if (need > 0 && limit - out < need)
+            return n;    /* no room left for the whole sequence */
+        switch (need) {
+        case 1: *out++ = (char)cp; break;
+        case 2:
+            *out++ = (cp>>6) | 0xC0;
+            *out++ = (cp & 0x3F) | 0x80;
+            break;
+        case 3:
+            *out++ = (cp>>12) | 0xE0;
+            *out++ = ((cp>>6) & 0x3F) | 0x80;
+            *out++ = (cp & 0x3F) | 0x80;
+            break;
+        case 4:
+            *out++ = (cp>>18) | 0xF0;
+            *out++ = ((cp>>12) & 0x3F) | 0x80;
+            *out++ = ((cp>>6) & 0x3F) | 0x80;
+            *out++ = (cp & 0x3F) | 0x80;
+            break;
+        }
+    }
+    if (out < limit)
+        *out = '\0';
+    return n;
+}
+
+/* UTF-8 byte count for codepoint ch, or 0 if ch is above the supported range */
+int u8_wc_nbytes(u_int32_t ch)
+{
+  int nbytes = 0;
+  if      (ch < 0x80)         nbytes = 1;
+  else if (ch < 0x800)        nbytes = 2;
+  else if (ch < 0x10000)      nbytes = 3;
+  else if (ch < 0x200000)     nbytes = 4;
+#if UTF8_SUPPORT_FULL_UCS4
+  else if (ch < 0x4000000)    nbytes = 5;
+  else if (ch < 0x7fffffffUL) nbytes = 6;
+#endif
+  return nbytes;
+}
+
+int u8_wc_toutf8(char *dest, u_int32_t ch)
+{
+    /* encode one codepoint into dest (no NUL appended); returns the number
+       of bytes written, or 0 with dest untouched when ch >= 0x110000 */
+    char *p = dest;
+
+    if (ch >= 0x110000)
+        return 0;
+    if (ch < 0x80) {
+        *p++ = (char)ch;
+    }
+    else if (ch < 0x800) {
+        *p++ = (ch>>6) | 0xC0;
+    }
+    else if (ch < 0x10000) {
+        *p++ = (ch>>12) | 0xE0;
+        *p++ = ((ch>>6) & 0x3F) | 0x80;
+    }
+    else {
+        *p++ = (ch>>18) | 0xF0;
+        *p++ = ((ch>>12) & 0x3F) | 0x80;
+        *p++ = ((ch>>6) & 0x3F) | 0x80;
+    }
+    if (ch >= 0x80) *p++ = (ch & 0x3F) | 0x80;  /* final continuation byte */
+    return (int)(p - dest);
+}
+
+/*-- moo: like u8_wc_toutf8(), then stores '\0' right after the encoded bytes --*/
+int u8_wc_toutf8_nul(char *dest, u_int32_t ch)
+{
+  char *tail = dest + u8_wc_toutf8(dest, ch);
+  *tail = '\0';
+  return (int)(tail - dest);
+}
+
+/* charnum => byte offset: walk up to charnum characters forward from str
+   and return how many bytes were covered. The walk is bounded only by
+   charnum and by a '\0' byte in str. */
+int u8_offset(char *str, int charnum)
+{
+    char *cursor = str;
+    int remaining;
+
+    for (remaining = charnum; remaining > 0 && *cursor != '\0'; remaining--)
+    {
+        int highbit = (*cursor & 0x80);
+        int trail;
+
+        cursor++;
+        if (!highbit)
+            continue;
+        /* skip at most three continuation bytes after a non-ASCII byte */
+        for (trail = 0; trail < 3 && !isutf(*cursor); trail++)
+            cursor++;
+    }
+    return (int)(cursor - str);
+}
+
+/* byte offset => charnum
+   counts the characters that start within s[0..offset-1], stopping early at
+   a '\0' byte. Continuation bytes are only inspected below s+offset, so a
+   buffer that is not '\0'-terminated is never read at or past its end. */
+int u8_charnum(char *s, int offset)
+{
+    int charnum = 0;
+    char *string = s;
+    char *const end = string + offset;
+
+    while (string < end && *string != '\0') {
+        if (*string++ & 0x80) {
+            /* skip up to three continuation bytes, but never look at or
+               beyond 'end' (an unbounded peek reads s[offset] when the
+               buffer ends in a multi-byte character) */
+            int trail = 3;
+            while (trail-- > 0 && string < end && !isutf(*string))
+                ++string;
+        }
+        ++charnum;
+    }
+    return charnum;
+}
+
+/* decode the sequence starting at s[*i] and leave *i on the byte after it:
+   bytes are accumulated 6 bits at a time until a '\0' or a non-continuation
+   byte comes next, then the offset for that sequence length is subtracted */
+u_int32_t u8_nextchar(char *s, int *i)
+{
+    u_int32_t acc = 0;
+    int nbytes = 0;
+
+    for (;;) {
+        acc = (acc << 6) + (unsigned char)s[(*i)++];
+        nbytes++;
+        if (s[*i] == 0 || isutf(s[*i])) break;
+    }
+    return acc - offsetsFromUTF8[nbytes-1];
+}
+
+/* number of characters in the '\0'-terminated string s. The terminator is
+   tested before each decode: u8_nextchar() on it would peek one byte past it */
+int u8_strlen(char *s)
+{
+    int count = 0;
+    int i = 0;
+
+    while (s[i] != '\0' && u8_nextchar(s, &i) != 0)
+        count++;
+    return count;
+}
+
+void u8_inc(char *s, int *i)
+{
+    /* advance *i past the character starting at s[*i]: always one byte,
+       plus up to three following continuation bytes when that first
+       byte has its high bit set */
+    int pos = *i;
+    int trail;
+
+    if (s[pos++] & 0x80) {
+        for (trail = 0; trail < 3 && !isutf(s[pos]); trail++)
+            pos++;
+    }
+    *i = pos;
+}
+
+void u8_dec(char *s, int *i)
+{
+    int steps = 0;   /* back up 1..4 bytes; the 4th step is taken unchecked */
+    do { --(*i); } while (++steps < 4 && !isutf(s[*i]));
+}
+
+/*-- moo: advance *sp by 1..4 bytes, stopping at the first non-continuation byte --*/
+void u8_inc_ptr(char **sp)
+{
+  int steps = 0;   /* the 4th step is taken without inspecting the byte */
+  do { ++(*sp); } while (++steps < 4 && !isutf(**sp));
+}
+
+/*-- moo: back *sp up by 1..4 bytes, stopping at the first non-continuation byte --*/
+void u8_dec_ptr(char **sp)
+{
+  int steps = 0;   /* the 4th step is taken without inspecting the byte */
+  do { --(*sp); } while (++steps < 4 && !isutf(**sp));
+}
diff --git a/pd/src/s_utf8.h b/pd/src/s_utf8.h
new file mode 100644
index 000000000..56c40d480
--- /dev/null
+++ b/pd/src/s_utf8.h
@@ -0,0 +1,88 @@
+#ifndef S_UTF8_H
+#define S_UTF8_H
+
+/*--moo--*/
+#ifndef u_int32_t
+# define u_int32_t unsigned int
+#endif
+
+#ifndef UCS4
+# define UCS4 u_int32_t
+#endif
+
+/* UTF8_SUPPORT_FULL_UCS4
+ *  define this to support the full potential range of UCS-4 codepoints
+ *  (in anticipation of a future UTF-8 standard)
+ */
+/*#define UTF8_SUPPORT_FULL_UCS4 1*/
+#undef UTF8_SUPPORT_FULL_UCS4
+
+/* UTF8_MAXBYTES
+ *   maximum number of bytes required to represent a single character in UTF-8
+ *
+ * UTF8_MAXBYTES1 = UTF8_MAXBYTES+1 
+ *  maximum bytes per character including NUL terminator
+ */
+#ifdef UTF8_SUPPORT_FULL_UCS4
+# ifndef UTF8_MAXBYTES
+#  define UTF8_MAXBYTES  6
+# endif
+# ifndef UTF8_MAXBYTES1
+#  define UTF8_MAXBYTES1 7
+# endif
+#else
+# ifndef UTF8_MAXBYTES
+#  define UTF8_MAXBYTES  4
+# endif
+# ifndef UTF8_MAXBYTES1
+#  define UTF8_MAXBYTES1 5
+# endif
+#endif
+/*--/moo--*/
+
+/* is c the start of a utf8 sequence? (true unless c has the bit pattern 10xxxxxx) */
+#define isutf(c) (((c)&0xC0)!=0x80)
+
+/* convert UTF-8 data to wide character: srcsz bytes (-1: '\0'-terminated) into at most sz-1 codepoints plus a 0 */
+int u8_toucs(u_int32_t *dest, int sz, char *src, int srcsz);
+
+/* the opposite conversion: srcsz codepoints (negative: 0-terminated) into a dest of sz bytes */
+int u8_toutf8(char *dest, int sz, u_int32_t *src, int srcsz);
+
+/* moo: get byte length of character number, or 0 if not supported (5 and 6 only with UTF8_SUPPORT_FULL_UCS4) */
+int u8_wc_nbytes(u_int32_t ch);
+
+/* moo: compute required storage for UTF-8 encoding of 's[0..n-1]' -- NOTE(review): declared here, but s_utf8.c has no definition */
+int u8_wcs_nbytes(u_int32_t *ucs, int size);
+
+/* single character to UTF-8, no NUL termination; returns bytes written (0 if ch >= 0x110000) */
+int u8_wc_toutf8(char *dest, u_int32_t ch);
+
+/* moo: single character to UTF-8, with NUL termination; returns bytes written, not counting the NUL */
+int u8_wc_toutf8_nul(char *dest, u_int32_t ch);
+
+/* character number to byte offset (stops early at a '\0' byte) */
+int u8_offset(char *str, int charnum);
+
+/* byte offset to character number (stops early at a '\0' byte) */
+int u8_charnum(char *s, int offset);
+
+/* return next character, updating an index variable (*i is a byte index into s) */
+u_int32_t u8_nextchar(char *s, int *i);
+
+/* move to next character (*i is a byte index into s) */
+void u8_inc(char *s, int *i);
+
+/* move to previous character (*i is a byte index into s; backs up at most 4 bytes) */
+void u8_dec(char *s, int *i);
+
+/* moo: move pointer to next character (advances at most 4 bytes) */
+void u8_inc_ptr(char **sp);
+
+/* moo: move pointer to previous character (backs up at most 4 bytes) */
+void u8_dec_ptr(char **sp);
+
+/* returns length in bytes of the utf-8 sequence starting at s, judged by its first byte */
+int u8_seqlen(char *s);
+
+#endif /* S_UTF8_H */
-- 
GitLab