fix a very nasty utf8 code-point counter/offsetter that can easily...

fix a very nasty utf8 code-point counter/offsetter that can easily dereference garbage thanks to Pd's use of non-null-terminated strings

fix a very nasty utf8 code-point counter/offsetter that can easily...
fix a very nasty utf8 code-point counter/offsetter that can easily dereference garbage thanks to Pd's use of non-null-terminated strings
0231ce07 · Jonathan Wilkes · 00448223 · 0231ce07 · 0231ce07 · 0231ce07
Commit 0231ce07 authored 7 years ago by Jonathan Wilkes
--- a/pd/src/g_rtext.c
+++ b/pd/src/g_rtext.c
@@ -248,7 +248,8 @@ static void rtext_senditup(t_rtext *x, int action, int *widthp, int *heightp,
            int inchars_c  = x_bufsize_c  - inindex_c;
            int maxindex_c =
                (inchars_c > widthlimit_c ? widthlimit_c : inchars_c);
-            int maxindex_b = u8_offset(x->x_buf + inindex_b, maxindex_c);
+            int maxindex_b = u8_offset(x->x_buf + inindex_b, maxindex_c,
+                x->x_bufsize - inindex_b);
            int eatchar = 1;
            //fprintf(stderr, "firstone <%s> inindex_b=%d maxindex_b=%d\n", x->x_buf + inindex_b, inindex_b, maxindex_b);
            int foundit_b  = firstone(x->x_buf + inindex_b, '\n', maxindex_b);
@@ -291,7 +292,8 @@ static void rtext_senditup(t_rtext *x, int action, int *widthp, int *heightp,
            {
                int actualx = (findx < 0 ? 0 :
                    (findx > foundit_c ? foundit_c : findx));
-                *indexp = inindex_b + u8_offset(x->x_buf + inindex_b, actualx);
+                *indexp = inindex_b + u8_offset(x->x_buf + inindex_b, actualx,
+                    x->x_bufsize - inindex_b);
                reportedindex = 1;
            }
            strncpy(tempbuf+outchars_b, x->x_buf + inindex_b, foundit_b);

--- a/pd/src/s_utf8.c
+++ b/pd/src/s_utf8.c
@@ -185,8 +185,27 @@ int u8_wc_toutf8_nul(char *dest, uint32_t ch)
 }

 /* charnum => byte offset */
-int u8_offset(char *str, int charnum)
+int u8_offset(char *str, int charnum, int bufsize)
 {
+    /* Implementation #1 of a tricky encoding in an unsafe language. It
+       assumes that we're dealing with null-terminated strings, but
+       x_buf of rtext isn't null-terminated. */
+    /*
+    int offs=0;
+
+    while (charnum > 0 && str[offs]) {
+        (void)(isutf(str[++offs]) || isutf(str[++offs]) ||
+               isutf(str[++offs]) || ++offs);
+        charnum--;
+    }
+    return offs;
+    */
+
+    /* Implementation number 2 apparently tried to fix that. Instead, it
+       is just a reimplementation that still potentially dereferences
+       non-existent pointers _if_ we try to get the offset at the last
+       character _and_ that last character happens to be wide. */
+    /*
    char *string = str;

    while (charnum > 0 && *string != '\0') {
@@ -205,11 +224,36 @@ int u8_offset(char *str, int charnum)
    }

    return (int)(string - str);
+    */
+
+    /* Here is an _extremely_ conservative implementation that protects
+       against dereferencing garbage pointers. */
+    int offs = 0;
+    if (isutf(str[offs]))
+    {
+        for (offs = 0; offs < bufsize; offs++)
+        {
+            if (isutf(str[offs]))
+            {
+                if (charnum <= 0)
+                    break;
+                charnum -= 1;
+            }
+        }
+    }
+    else
+    {
+        bug("u8_offset");
+    }
+    return offs;
 }

 /* byte offset => charnum */
 int u8_charnum(char *s, int offset)
 {
+    /* This has the same problem as the commented implementations of u8_offset
+       above. */
+    /*
    int charnum = 0;
    char *string = s;
    char *const end = string + offset;
@@ -229,6 +273,29 @@ int u8_charnum(char *s, int offset)
        ++charnum;
    }
    return charnum;
+    */
+
+    /* The original implementation which doesn't work well with
+       strings that aren't null terminated */
+    /*
+    int charnum = 0, offs=0;
+
+    while (offs < offset && s[offs]) {
+        (void)(isutf(s[++offs]) || isutf(s[++offs]) ||
+               isutf(s[++offs]) || ++offs);
+        charnum++;
+    }
+    return charnum;
+    */
+
+    /* An _extremely_ conservative implementation to avoid dereferencing
+       garbage. */
+    int charnum = 0, i;
+    for (i = 0; i < offset; i++)
+    {
+        if (isutf(s[i])) charnum += 1;
+    }
+    return charnum;
 }

 /* reads the next utf-8 sequence out of a string, updating an index */

--- a/pd/src/s_utf8.h
+++ b/pd/src/s_utf8.h
@@ -59,7 +59,7 @@ int u8_wc_toutf8(char *dest, uint32_t ch);
 int u8_wc_toutf8_nul(char *dest, uint32_t ch);

 /* character number to byte offset */
-int u8_offset(char *str, int charnum);
+int u8_offset(char *str, int charnum, int bufsize);

 /* byte offset to character number */
 int u8_charnum(char *s, int offset);