New memcpy()-based wordwrap() implementation. The simple case

(single-character break, no forced break) appears to be about 60% faster, and there's simply no comparison for non-simple cases with non-trivial amounts of text. The old algorithm was O(n^2) (with an unfortunately large constant factor) because of the use of strncat(), the new one is O(n). Added some more tests, too. @ - Made wordwrap() significantly faster. (Jim) # test case: $t = join('',file('ChangeLog')); $w = wordwrap($t,10,"\n",1); # new code completes in less than a second. i'm still waiting for the # old code to finish.
author: jim winstead <jimw@php.net> 2002-01-05 20:46:43 +0000
committer: jim winstead <jimw@php.net> 2002-01-05 20:46:43 +0000
commit: ca15b22212d3ea30c931e4cb7f4e1be869ed36fc (patch)
tree: 625d05aa1b7484346d79a5b64070c8fed07a2313 /ext/standard/string.c
parent: e56fb1639b9c5da4f4d6b6b7b9c99888bfdd045b (diff)
download: php-git-ca15b22212d3ea30c931e4cb7f4e1be869ed36fc.tar.gz
1 files changed, 75 insertions, 98 deletions
diff --git a/ext/standard/string.c b/ext/standard/string.c
index 6b897346e0..3149130089 100644
--- a/ext/standard/string.c
+++ b/ext/standard/string.c
@@ -613,9 +613,11 @@ PHP_FUNCTION(ltrim)
    Wraps buffer to selected number of characters using string break char */
 PHP_FUNCTION(wordwrap)
 {
-	char *text, *breakchar = "\n", *newtext;
+	const char *text, *breakchar = "\n";
+	char *newtext;
 	int textlen, breakcharlen = 1, newtextlen;
-	long linelength = 75, i = 0, l = 0, pgr = 0, last = 0;
+	long current = 0, laststart = 0, lastspace = 0;
+	long linelength = 75;
 	zend_bool docut = 0;
 
 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|lsb", &text, &textlen, &linelength, &breakchar, &breakcharlen, &docut) == FAILURE) {
@@ -634,121 +636,96 @@ PHP_FUNCTION(wordwrap)
 	   additional storage space */
 	if (breakcharlen == 1 && !docut) {
 		newtext = estrndup(text, textlen);
-		while (newtext[i] != '\0') {
 
-			/* prescan line to see if it is greater than linelength */
-			l = 0;
-			while (newtext[i+l] != breakchar[0]) {
-				if (newtext[i+l] == '\0') {
-					l--;
-					break;
-				}
-				l++;
+		laststart = lastspace = 0;
+		for (current = 0; current < textlen; current++) {
+			if (text[current] == breakchar[0]) {
+				laststart = lastspace = current;
 			}
-
-			if (l >= linelength) {
-				pgr = l;
-				l = linelength;
-
-				/* needs breaking; work backwards to find previous word */
-				while (l >= 0) {
-					if (newtext[i+l] == ' ') {
-						newtext[i+l] = breakchar[0];
-						break;
-					}
-					l--;
-				}
-
-				if (l == -1) {
-					/* couldn't break is backwards, try looking forwards */
-					l = linelength;
-					while (l <= pgr) {
-						if(newtext[i+l] == ' ') {
-							newtext[i+l] = breakchar[0];
-							break;
-						}
-						l++;
-					}
+			else if (text[current] == ' ') {
+				if (current - laststart >= linelength) {
+					newtext[current] = breakchar[0];
+					laststart = current;
 				}
+				lastspace = current;
+			}
+			else if (current - laststart >= linelength
+					&& laststart != lastspace) {
+				newtext[lastspace] = breakchar[0];
+				laststart = lastspace;
 			}
-
-			i += l + 1;
 		}
 
 		RETURN_STRINGL(newtext, textlen, 0);
 	}
 	else {
-		/* Multiple character line break */
-		newtextlen = textlen * (breakcharlen + 1) + 1;
+		/* Multiple character line break or forced cut */
+		if (linelength > 0) {
+			newtextlen = textlen + (textlen/linelength) * breakcharlen + 1;
+		}
+		else {
+			newtextlen = textlen * (breakcharlen + 1) + 1;
+		}
 		newtext = emalloc(newtextlen);
-		newtext[0] = '\0';
 
-		i = 0;
-		while (text[i] != '\0') {
-
-			/* prescan line to see if it is greater than linelength */
-			l = 0;
-			while (text[i+l] != '\0') {
-				if (text[i+l] == breakchar[0]) {
-					if (breakcharlen == 1 || !strncmp(text+i+l, breakchar, breakcharlen))
-						break;
-				}
-				l++;
+		/* now keep track of the actual new text length */
+		newtextlen = 0;
+
+		laststart = lastspace = 0;
+		for (current = 0; current < textlen; current++) {
+			/* when we hit an existing break, copy to new buffer, and
+			 * fix up laststart and lastspace */
+			if (text[current] == breakchar[0]
+				&& current + breakcharlen < textlen
+				&& !strncmp(text+current, breakchar, breakcharlen)) {
+				memcpy(newtext+newtextlen, text+laststart, current-laststart+breakcharlen);
+				newtextlen += current-laststart+breakcharlen;
+				current += breakcharlen - 1;
+				laststart = lastspace = current + 1;
 			}
-
-			if (l >= linelength) {
-				pgr = l;
-				l = linelength;
-
-				/* needs breaking; work backwards to find previous word */
-				while (l >= 0) {
-					if (text[i+l] == ' ') {
-						strncat(newtext, text+last, i+l-last);
-						strncat(newtext, breakchar, breakcharlen);
-						last = i + l + 1;
-						break;
-					}
-					l--;
+			/* if it is a space, check if it is at the line boundary,
+			 * copy and insert a break, or just keep track of it */
+			else if (text[current] == ' ') {
+				if (current - laststart >= linelength) {
+					memcpy(newtext+newtextlen, text+laststart, current-laststart);
+					newtextlen += current - laststart;
+					memcpy(newtext+newtextlen, breakchar, breakcharlen);
+					newtextlen += breakcharlen;
+					laststart = current + 1;
 				}
-
-				if (l == -1) {
-					/* couldn't break it backwards, try looking forwards */
-					l = linelength - 1;
-					while (l <= pgr) {
-						if (!docut) {
-							if (text[i+l] == ' ') {
-								strncat(newtext, text+last, i+l-last);
-								strncat(newtext, breakchar, breakcharlen);
-								last = i + l + 1;
-								++l;
-								break;
-							}
-						}
-						/* cut if longer than allowed */
-						else {
-							if (text[i+l] == ' ' || l > i-last) {
-								strncat(newtext, text+last, i+l-last+1);
-								strncat(newtext, breakchar, breakcharlen);
-								last = i + l + 1;
-								++l;
-								break;
-							}
-						}
-						l++;
-					}
-				}
-				i += l + 1;
+				lastspace = current;
 			}
-			else {
-				i += (l ? l : 1);
+			/* if we are cutting, and we've accumulated enough
+			 * characters, copy and insert a break. */
+			else if (current - laststart >= linelength && docut) {
+				memcpy(newtext+newtextlen, text+laststart, current-laststart);
+				newtextlen += current - laststart;
+				memcpy(newtext+newtextlen, breakchar, breakcharlen);
+				newtextlen += breakcharlen;
+				laststart = lastspace = current;
+			}
+			/* if the current word puts us over the linelength, copy
+			 * back up until the last space, insert a break, and move
+			 * up the laststart */
+			else if (current - laststart >= linelength
+					&& laststart < lastspace) {
+				memcpy(newtext+newtextlen, text+laststart, lastspace-laststart);
+				newtextlen += lastspace - laststart;
+				memcpy(newtext+newtextlen, breakchar, breakcharlen);
+				newtextlen += breakcharlen;
+				laststart = lastspace = lastspace + 1;
 			}
 		}
 
-		if (i + l > last) {
-			strncat(newtext, text+last, i+l-last);
+		/* copy over any stragglers */
+		if (laststart != current) {
+			memcpy(newtext+newtextlen, text+laststart, current-laststart);
+			newtextlen += current - laststart;
 		}
 
-		RETURN_STRINGL(newtext, strlen(newtext), 0);
+		newtext[newtextlen] = '\0';
+
+		RETURN_STRINGL(newtext, newtextlen, 0);
 	}
 }
 /* }}} */
author	jim winstead <jimw@php.net>	2002-01-05 20:46:43 +0000
committer	jim winstead <jimw@php.net>	2002-01-05 20:46:43 +0000
commit	ca15b22212d3ea30c931e4cb7f4e1be869ed36fc (patch)
tree	625d05aa1b7484346d79a5b64070c8fed07a2313 /ext/standard/string.c
parent	e56fb1639b9c5da4f4d6b6b7b9c99888bfdd045b (diff)
download	php-git-ca15b22212d3ea30c931e4cb7f4e1be869ed36fc.tar.gz