From 0231ce074ef34d184b293c4be34b96b264d12287 Mon Sep 17 00:00:00 2001
From: Jonathan Wilkes <jon.w.wilkes@gmail.com>
Date: Fri, 7 Jul 2017 21:08:22 -0400
Subject: [PATCH] fix a very nasty utf8 code-point counter/offsetter that can
 easily dereference garbage thanks to Pd's use of non-null-terminated
 strings

---
 pd/src/g_rtext.c |  6 +++--
 pd/src/s_utf8.c  | 69 +++++++++++++++++++++++++++++++++++++++++++++++-
 pd/src/s_utf8.h  |  2 +-
 3 files changed, 73 insertions(+), 4 deletions(-)

diff --git a/pd/src/g_rtext.c b/pd/src/g_rtext.c
index 6c4b9cdfc..04a8f6761 100644
--- a/pd/src/g_rtext.c
+++ b/pd/src/g_rtext.c
@@ -248,7 +248,8 @@ static void rtext_senditup(t_rtext *x, int action, int *widthp, int *heightp,
             int inchars_c  = x_bufsize_c  - inindex_c;
             int maxindex_c =
                 (inchars_c > widthlimit_c ? widthlimit_c : inchars_c);
-            int maxindex_b = u8_offset(x->x_buf + inindex_b, maxindex_c);
+            int maxindex_b = u8_offset(x->x_buf + inindex_b, maxindex_c,
+                x->x_bufsize - inindex_b);
             int eatchar = 1;
             //fprintf(stderr, "firstone <%s> inindex_b=%d maxindex_b=%d\n", x->x_buf + inindex_b, inindex_b, maxindex_b);
             int foundit_b  = firstone(x->x_buf + inindex_b, '\n', maxindex_b);
@@ -291,7 +292,8 @@ static void rtext_senditup(t_rtext *x, int action, int *widthp, int *heightp,
             {
                 int actualx = (findx < 0 ? 0 :
                     (findx > foundit_c ? foundit_c : findx));
-                *indexp = inindex_b + u8_offset(x->x_buf + inindex_b, actualx);
+                *indexp = inindex_b + u8_offset(x->x_buf + inindex_b, actualx,
+                    x->x_bufsize - inindex_b);
                 reportedindex = 1;
             }
             strncpy(tempbuf+outchars_b, x->x_buf + inindex_b, foundit_b);
diff --git a/pd/src/s_utf8.c b/pd/src/s_utf8.c
index 6bc8e230c..8cf20aa27 100644
--- a/pd/src/s_utf8.c
+++ b/pd/src/s_utf8.c
@@ -185,8 +185,27 @@ int u8_wc_toutf8_nul(char *dest, uint32_t ch)
 }
 
 /* charnum => byte offset */
-int u8_offset(char *str, int charnum)
+int u8_offset(char *str, int charnum, int bufsize)
 {
+    /* Implementation #1 of a tricky encoding in an unsafe language. It
+       assumes that we're dealing with null-terminated strings, but
+       x_buf of rtext isn't null-terminated. */
+    /*
+    int offs=0;
+
+    while (charnum > 0 && str[offs]) {
+        (void)(isutf(str[++offs]) || isutf(str[++offs]) ||
+               isutf(str[++offs]) || ++offs);
+        charnum--;
+    }
+    return offs;
+    */
+
+    /* Implementation number 2 apparently tried to fix that. Instead, it
+       just made a reimplementation that still potentially dereferences
+       non-existent pointers _if_ we try to get the offset at the last
+       character _and_ that last character happens to be wide. */
+    /*
     char *string = str;
 
     while (charnum > 0 && *string != '\0') {
@@ -205,11 +224,36 @@ int u8_offset(char *str, int charnum)
     }
 
     return (int)(string - str);
+    */
+
+    /* Here is an _extremely_ conservative implementation that protects
+       against reading past bufsize (str[0] is still read unconditionally). */
+    int offs = 0;
+    if (isutf(str[offs]))
+    {
+        for (offs = 0; offs < bufsize; offs++)
+        {
+            if (isutf(str[offs]))
+            {
+                if (charnum <= 0)
+                    break;
+                charnum -= 1;
+            }
+        }
+    }
+    else
+    {
+        bug("u8_offset");
+    }
+    return offs;
 }
 
 /* byte offset => charnum */
 int u8_charnum(char *s, int offset)
 {
+    /* This has the same problem as the commented implementations of u8_offset
+       above. */
+    /*
     int charnum = 0;
     char *string = s;
     char *const end = string + offset;
@@ -229,6 +273,29 @@ int u8_charnum(char *s, int offset)
         ++charnum;
     }
     return charnum;
+    */
+
+    /* The original implementation which doesn't work well with
+       strings that aren't null-terminated */
+    /*
+    int charnum = 0, offs=0;
+
+    while (offs < offset && s[offs]) {
+        (void)(isutf(s[++offs]) || isutf(s[++offs]) ||
+               isutf(s[++offs]) || ++offs);
+        charnum++;
+    }
+    return charnum;
+    */
+
+    /* An _extremely_ conservative implementation to avoid dereferencing
+       garbage. */
+    int charnum = 0, i;
+    for (i = 0; i < offset; i++)
+    {
+        if (isutf(s[i])) charnum += 1;
+    }
+    return charnum;
 }
 
 /* reads the next utf-8 sequence out of a string, updating an index */
diff --git a/pd/src/s_utf8.h b/pd/src/s_utf8.h
index c8969e8be..88a324925 100644
--- a/pd/src/s_utf8.h
+++ b/pd/src/s_utf8.h
@@ -59,7 +59,7 @@ int u8_wc_toutf8(char *dest, uint32_t ch);
 int u8_wc_toutf8_nul(char *dest, uint32_t ch);
 
 /* character number to byte offset */
-int u8_offset(char *str, int charnum);
+int u8_offset(char *str, int charnum, int bufsize);
 
 /* byte offset to character number */
 int u8_charnum(char *s, int offset);
-- 
GitLab