fix chop formats with non PV vars

[perl #119847], [perl #119849], [perl #119851] Strange vars like ties, overloads, or stringified refs (and in recent perls, pure NOK vars) would generally do the wrong thing in formats when the var is treated as a string and repeatedly chopped, as in ^<<<~~ and similar. This would manifest itself in infinite loops, utf8 errors, etc. A recent change that stopped a stringified NOK from getting converted into a POK made the same badness happen for plain NVs too. This commit contains two main fixes. First, the chopping was done using sv_chop(), which only worked on POK strings. If the sv is !POK, we now do sv_setpvn() instead, which is less efficient, but will ensure the right thing is always done. Secondly, we make sure that the sv is accessed only once per cycle, doing s = SvPV(sv, len) or similar. After that, all access is done only via s and len. One place was using SvPVX(sv), and several places were using the sv for utf8<->byte length conversions, such as sv_pos_b2u(). It turns out that all the complex utf8 handling could be enormously simplified. Since the code that needed to do utf8/byte length conversions already scanned the string looking for suitable split points (such as spaces or \n or \r), it was easiest to include any utf8 processing in the same loop - i.e. incrementing s by UTF8SKIP(s) each time, but incrementing the character count by 1. The original diagnosis and reporting of this issue was done by Nicholas Clark, who also supplied most of the tests.
author: David Mitchell <davem@iabyn.com> 2013-11-07 12:17:26 +0000
committer: David Mitchell <davem@iabyn.com> 2013-11-11 11:21:40 +0000
commit: 9b4bdfd44e0e6d44a447f231c281f967c7ca35c9 (patch)
tree: faa5f43fb5fc1b063fc2328572ffbe00a53f7cf0 /pp_ctl.c
parent: 4a73dc0bc2ea5e1c3bd27a27acd3942dc6aa6c1e (diff)
download: perl-9b4bdfd44e0e6d44a447f231c281f967c7ca35c9.tar.gz
1 files changed, 82 insertions, 131 deletions
diff --git a/pp_ctl.c b/pp_ctl.c
index 1ab3f42054..95727f201a 100644
--- a/pp_ctl.c
+++ b/pp_ctl.c
@@ -466,7 +466,8 @@ PP(pp_formline)
     I32 arg;
     SV *sv = NULL; /* current item */
     const char *item = NULL;/* string value of current item */
-    I32 itemsize  = 0;	    /* length of current item, possibly truncated */
+    I32 itemsize  = 0;	    /* length (chars) of item, possibly truncated */
+    I32 itembytes = 0;	    /* as itemsize, but length in bytes */
     I32 fieldsize = 0;	    /* width of current field */
     I32 lines = 0;	    /* number of lines that have been output */
     bool chopspace = (strchr(PL_chopset, ' ') != NULL); /* does $: have space */
@@ -474,7 +475,7 @@ PP(pp_formline)
     STRLEN linemark = 0;    /* pos of start of line in output */
     NV value;
     bool gotsome = FALSE;   /* seen at least one non-blank item on this line */
-    STRLEN len;
+    STRLEN len;             /* length of current sv */
     STRLEN linemax;	    /* estimate of output size in bytes */
     bool item_is_utf8 = FALSE;
     bool targ_is_utf8 = FALSE;
@@ -569,133 +570,85 @@ PP(pp_formline)
 
 	case FF_CHECKNL: /* find max len of item (up to \n) that fits field */
 	    {
-		const char *send;
 		const char *s = item = SvPV_const(sv, len);
-		itemsize = len;
-		if (DO_UTF8(sv)) {
-		    itemsize = sv_len_utf8(sv);
-		    if (itemsize != (I32)len) {
-			I32 itembytes;
-			if (itemsize > fieldsize) {
-			    itemsize = fieldsize;
-			    itembytes = itemsize;
-			    sv_pos_u2b(sv, &itembytes, 0);
-			}
-			else
-			    itembytes = len;
-			send = chophere = s + itembytes;
-			while (s < send) {
-			    if (! isCNTRL(*s))
-				gotsome = TRUE;
-			    else if (*s == '\n')
-				break;
-			    s++;
-			}
-			item_is_utf8 = TRUE;
-			itemsize = s - item;
-			sv_pos_b2u(sv, &itemsize);
-			break;
-		    }
-		}
-		item_is_utf8 = FALSE;
-		if (itemsize > fieldsize)
-		    itemsize = fieldsize;
-		send = chophere = s + itemsize;
-		while (s < send) {
-		    if (! isCNTRL(*s))
-			gotsome = TRUE;
-		    else if (*s == '\n')
-			break;
-		    s++;
-		}
-		itemsize = s - item;
+		const char *send = s + len;
+
+                itemsize = 0;
+		item_is_utf8 = DO_UTF8(sv);
+                while (s < send) {
+                    if (!isCNTRL(*s))
+                        gotsome = TRUE;
+                    else if (*s == '\n')
+                        break;
+
+                    if (item_is_utf8)
+                        s += UTF8SKIP(s);
+                    else
+                        s++;
+                    itemsize++;
+                    if (itemsize == fieldsize)
+                        break;
+                }
+                itembytes = s - item;
 		break;
 	    }
 
 	case FF_CHECKCHOP: /* like CHECKNL, but up to highest split point */
 	    {
 		const char *s = item = SvPV_const(sv, len);
-		itemsize = len;
-		if (DO_UTF8(sv)) {
-		    itemsize = sv_len_utf8(sv);
-		    if (itemsize != (I32)len) {
-			I32 itembytes;
-			if (itemsize <= fieldsize) {
-			    const char *send = chophere = s + itemsize;
-			    while (s < send) {
-				if (*s == '\r') {
-				    itemsize = s - item;
-				    chophere = s;
-				    break;
-				}
-				if (! isCNTRL(*s))
-				    gotsome = TRUE;
-                                s++;
-			    }
-			}
-			else {
-			    const char *send;
-			    itemsize = fieldsize;
-			    itembytes = itemsize;
-			    sv_pos_u2b(sv, &itembytes, 0);
-			    send = chophere = s + itembytes;
-			    while (s < send || (s == send && isSPACE(*s))) {
-				if (isSPACE(*s)) {
-				    if (chopspace)
-					chophere = s;
-				    if (*s == '\r')
-					break;
-				}
-				else {
-				    if (! isCNTRL(*s))
-					gotsome = TRUE;
-				    if (strchr(PL_chopset, *s))
-					chophere = s + 1;
-				}
-				s++;
-			    }
-			    itemsize = chophere - item;
-			    sv_pos_b2u(sv, &itemsize);
-			}
-			item_is_utf8 = TRUE;
-			break;
-		    }
-		}
-		item_is_utf8 = FALSE;
-		if (itemsize <= fieldsize) {
-		    const char *const send = chophere = s + itemsize;
-		    while (s < send) {
-			if (*s == '\r') {
-			    itemsize = s - item;
-			    chophere = s;
-			    break;
-			}
-			if (! isCNTRL(*s))
-			    gotsome = TRUE;
+		const char *send = s + len;
+                I32 size = 0;
+
+                chophere = NULL;
+		item_is_utf8 = DO_UTF8(sv);
+                while (s < send) {
+                    /* look for a legal split position */
+                    if (isSPACE(*s)) {
+                        if (*s == '\r') {
+                            chophere = s;
+                            itemsize = size;
+                            break;
+                        }
+                        if (chopspace) {
+                            /* provisional split point */
+                            chophere = s;
+                            itemsize = size;
+                        }
+                        /* we delay testing fieldsize until after we've
+                         * processed the possible split char directly
+                         * following the last field char; so if fieldsize=3
+                         * and item="a b cdef", we consume "a b", not "a".
+                         * Ditto further down.
+                         */
+                        if (size == fieldsize)
+                            break;
+                    }
+                    else {
+                        if (strchr(PL_chopset, *s)) {
+                            /* provisional split point */
+                            /* for a non-space split char, we include
+                             * the split char; hence the '+1' */
+                            chophere = s + 1;
+                            itemsize = size;
+                        }
+                        if (size == fieldsize)
+                            break;
+                        if (!isCNTRL(*s))
+                            gotsome = TRUE;
+                    }
+
+                    if (item_is_utf8)
+                        s += UTF8SKIP(s);
+                    else
                         s++;
-		    }
-		}
-		else {
-		    const char *send;
-		    itemsize = fieldsize;
-		    send = chophere = s + itemsize;
-		    while (s < send || (s == send && isSPACE(*s))) {
-			if (isSPACE(*s)) {
-			    if (chopspace)
-				chophere = s;
-			    if (*s == '\r')
-				break;
-			}
-			else {
-			    if (! isCNTRL(*s))
-				gotsome = TRUE;
-			    if (strchr(PL_chopset, *s))
-				chophere = s + 1;
-			}
-			s++;
-		    }
-		    itemsize = chophere - item;
-		}
+                    size++;
+                }
+                if (!chophere || s == send) {
+                    chophere = s;
+                    itemsize = size;
+                }
+                itembytes = chophere - item;
+
 		break;
 	    }
 
@@ -719,16 +672,9 @@ PP(pp_formline)
 	    break;
 
 	case FF_ITEM: /* append a text item, while blanking ctrl chars */
-	    to_copy = itemsize;
+	    to_copy = itembytes;
 	    source = (U8 *)item;
 	    trans = 1;
-	    if (item_is_utf8) {
-		/* convert to_copy from chars to bytes */
-		U8 *s = source;
-		while (to_copy--)
-		   s += UTF8SKIP(s);
-		to_copy = s - source;
-	    }
 	    goto append;
 
 	case FF_CHOP: /* (for ^*) chop the current item */
@@ -738,7 +684,12 @@ PP(pp_formline)
 		    while (isSPACE(*s))
 			s++;
 		}
-		sv_chop(sv,s);
+                if (SvPOKp(sv))
+                    sv_chop(sv,s);
+                else
+                    /* tied, overloaded or similar strangeness.
+                     * Do it the hard way */
+                    sv_setpvn(sv, s, len - (s-item));
 		SvSETMAGIC(sv);
 		break;
 	    }
@@ -763,7 +714,7 @@ PP(pp_formline)
 		while (s < send) {
 		    if (*s++ == '\n') {
 			if (oneline) {
-			    to_copy = s - SvPVX_const(sv) - 1;
+			    to_copy = s - item - 1;
 			    chophere = s;
 			    break;
 			} else {
author	David Mitchell <davem@iabyn.com>	2013-11-07 12:17:26 +0000
committer	David Mitchell <davem@iabyn.com>	2013-11-11 11:21:40 +0000
commit	9b4bdfd44e0e6d44a447f231c281f967c7ca35c9 (patch)
tree	faa5f43fb5fc1b063fc2328572ffbe00a53f7cf0 /pp_ctl.c
parent	4a73dc0bc2ea5e1c3bd27a27acd3942dc6aa6c1e (diff)
download	perl-9b4bdfd44e0e6d44a447f231c281f967c7ca35c9.tar.gz