poparser: support feeding anything [1/2]

This is the first patch. It mainly solves cases like `msgid "ss" "gg"`. This is valid input (confirmed by GNU msgfmt). Just like a C compiler, we should concatenate the strings and treat them as `"ssgg"`. Previously, feed_line assumed there was only one `"xxx"` string per line. For example, when it parsed `msgid "ss"`, the whole line was marked as parsed, so `"gg"` was ignored. Now feed_line is a loop that consumes exactly the part it has parsed, so that all strings on the line are consumed.
author: xhe <xw897002528@gmail.com> 2020-01-22 14:42:04 +0800
committer: xhe <xw897002528@gmail.com> 2020-01-22 22:03:27 +0800
commit: c83067e03d2929f358dbdc10bea49d3df0e5e78e (patch)
tree: 208d3c4e98618f530b1fc9bb37d6a563178e2468
parent: adaa9c64921e80f2b8dd3610ffb508618b9204f3 (diff)
download: gettext-tiny-c83067e03d2929f358dbdc10bea49d3df0e5e78e.tar.gz
1 files changed, 206 insertions, 196 deletions
diff --git a/src/poparser.c b/src/poparser.c
index d7aafb1..2780372 100644
--- a/src/poparser.c
+++ b/src/poparser.c
@@ -112,6 +112,7 @@ static inline enum po_error poparser_clean(struct po_parser *p, po_message_t msg
 
 enum po_error poparser_feed_line(struct po_parser *p, char* in, size_t in_len) {
 	char *line = in;
+	size_t line_pos;
 	size_t line_len = in_len;
 	po_message_t msg = &p->msg;
 	int cnt = 0;
@@ -119,132 +120,69 @@ enum po_error poparser_feed_line(struct po_parser *p, char* in, size_t in_len) {
 	size_t len;
 	char *x, *y, *z;
 
-	if (line_len == 0 || line[0] == '\n') {
-		// ignore blank lines
-		return po_success;
-	} else if (line[0] == '#') {
-		if (p->previous == po_str) {
-			if ( (t = poparser_clean(p, msg)) != po_success)
-				return t;
-		}
+	// if we need to conv encodings
+	if (p->cd) {
+		x = p->buf;
+		len = p->bufsize;
+		if (iconv(p->cd, &line, &line_len, &x, &len) == (size_t)-1)
+			return -po_failed_iconv;
 
-		switch (line[1]) {
-		case ',':
-			x = &line[2];
-			while (*x && (y = strpbrk(x, " ,\n"))) {
-				if (y != x && !memcmp(x, "fuzzy", y-x)) {
-					msg->flags |= PO_FUZZY;
-				}
-				x = y + strspn(y, " ,\n");
-			}
-			break;
-		case '.':
-			// extracted comments for translators, ignore
-		case ':':
-			// reference comments for translators, ignore
-		case '|':
-			// previous untranslated strings for translators, ignore
-		default:
-			// ignore normal comments
-			return po_success;
-		}
-	} else if (line[0] == '"') {
-		if ( (y = strrchr(x = &line[1], '"')) == NULL)
-			return -po_excepted_token;
-
-		len = y - x;
-		*y = 0;
-
-		if (p->cd) {
-			line = x;
-			line_len = len + 1;
-			x = p->buf;
-			len = p->bufsize;
-			if (iconv(p->cd, &line, &line_len, &x, &len) == (size_t)-1) 
-				return -po_failed_iconv;
-
-			if (line_len != 0)
-				return -po_failed_iconv;
-
-			len = x - p->buf; 
-			x = p->buf;
-		}
+		if (line_len != 0)
+			return -po_failed_iconv;
 
-		for (cnt = 0; cnt < st_max; cnt++) {
-			if (strstr(x, sysdep_str[cnt])) {
-				msg->sysdep |= sysdep[cnt];
-			}
-		}
-
-		switch (p->previous) {
-		case po_str:
-			if ((t = poparser_feed_hdr(p, x)) != po_success) {
-				return t;
-			}
-
-			cnt = p->strcnt - 1;
-			if (p->stage == ps_parse) {
-				len = unescape(x, &msg->str[cnt][msg->strlen[cnt]], p->max_strlen[cnt]);
-			}
+		line_len = x - p->buf;
+		line = p->buf;
+	}
 
-			msg->strlen[cnt] += len;
+	for (line_pos=0; line_pos < line_len;) {
+		switch (line[line_pos]) {
+		case '\n':
+		case ' ':
+			line_pos++;
 			break;
-		case po_plural:
-			if (p->stage == ps_parse) {
-				len = unescape(x, &msg->plural[msg->plural_len], p->max_plural_len);
+		case '#':
+			if (p->previous == po_str) {
+				if ( (t = poparser_clean(p, msg)) != po_success)
+					return t;
 			}
 
-			msg->plural_len += len;
-			break;
-		case po_id:
-			if (p->stage == ps_parse) {
-				len = unescape(x, &msg->id[msg->id_len], p->max_id_len);
+			switch (line[line_pos+1]) {
+			case ',':
+				x = &line[line_pos+2];
+				while (*x && (y = strpbrk(x, " ,\n"))) {
+					if (y != x && !memcmp(x, "fuzzy", y-x)) {
+						msg->flags |= PO_FUZZY;
+					}
+					x = y + strspn(y, " ,\n");
+				}
+				break;
+			case '.':
+				// extracted comments for translators, ignore
+			case ':':
+				// reference comments for translators, ignore
+			case '|':
+				// previous untranslated strings for translators, ignore
+			default:
+				// ignore normal comments
+				break;
 			}
 
-			msg->id_len += len;
+			// whole line is commented
+			line_pos = line_len;
 			break;
-		case po_ctxt:
-			if (p->stage == ps_parse) {
-				len = unescape(x, &msg->ctxt[msg->ctxt_len], p->max_ctxt_len);
+		case '"':
+			y = x = &line[line_pos+1];
+			while (true) {
+				if ( (y = strchr(y, '"')) == NULL)
+					return -po_excepted_token;
+
+				// only if it's not an escaped "
+				if (*(y-1) != '\\') break;
 			}
 
-			msg->ctxt_len += len;
-			break;
-		default:
-			return -po_invalid_entry;
-		}
-	} else if ((z = strstarts(line, "msg"))) {
-		if ( (x = strchr(z, '"')) == NULL)
-			return -po_excepted_token;
-
-		if ( (y = strrchr(x+1, '"')) == NULL)
-			return -po_excepted_token;
-
-		len = y - ++x;
-		*y = 0;
-
-		if (p->cd) {
-			line = x;
-			line_len = len + 1;
-			x = p->buf;
-			len = p->bufsize;
-
-			if (iconv(p->cd, &line, &line_len, &x, &len) == (size_t)-1) 
-				return -po_failed_iconv;
-
-			if (line_len != 0)
-				return -po_failed_iconv;
-
-			len = x - p->buf; 
-			x = p->buf;
-		}
-
-		if ((y = strstarts(z, "ctxt")) && isspace(*y)) {
-			if ( (t = poparser_clean(p, msg)) != po_success)
-				return t;
-
-			if (msg->id_len || msg->plural_len)
-				return -po_invalid_entry;
+			len = y - x;
+			*y = 0;
+			line_pos += len + 2;
 
 			for (cnt = 0; cnt < st_max; cnt++) {
 				if (strstr(x, sysdep_str[cnt])) {
@@ -252,105 +190,177 @@ enum po_error poparser_feed_line(struct po_parser *p, char* in, size_t in_len) {
 				}
 			}
 
-			if (p->stage == ps_parse) {
-				if (msg->ctxt == NULL) {
-					return -po_internal;
+			switch (p->previous) {
+			case po_str:
+				if ((t = poparser_feed_hdr(p, x)) != po_success) {
+					return t;
 				}
 
-				len = unescape(x, msg->ctxt, p->max_ctxt_len);
-			}
-
-			msg->ctxt_len = len;
-			p->previous = po_ctxt;
-		} else if ((y = strstarts(z, "id")) && isspace(*y)) {
-			if ( (t = poparser_clean(p, msg)) != po_success)
-				return t;
-
-			if (msg->plural_len)
-				return -po_invalid_entry;
-
-			for (cnt = 0; cnt < st_max; cnt++) {
-				if (strstr(x, sysdep_str[cnt])) {
-					msg->sysdep |= sysdep[cnt];
+				cnt = p->strcnt - 1;
+				if (p->stage == ps_parse) {
+					len = unescape(x, &msg->str[cnt][msg->strlen[cnt]], p->max_strlen[cnt]);
 				}
-			}
 
-			if (p->stage == ps_parse) {
-				if (msg->id == NULL) {
-					return -po_internal;
+				msg->strlen[cnt] += len;
+				break;
+			case po_plural:
+				if (p->stage == ps_parse) {
+					len = unescape(x, &msg->plural[msg->plural_len], p->max_plural_len);
 				}
 
-				len = unescape(x, msg->id, p->max_id_len);
-			}
-
-			msg->id_len = len;
-			p->previous = po_id;
-		} else if ((y = strstarts(z, "id_plural")) && isspace(*y)) {
-			if (!msg->id_len || p->strcnt)
-				return -po_invalid_entry;
-
-			if (p->stage == ps_parse) {
-				if (msg->plural == NULL) {
-					return -po_internal;
+				msg->plural_len += len;
+				break;
+			case po_id:
+				if (p->stage == ps_parse) {
+					len = unescape(x, &msg->id[msg->id_len], p->max_id_len);
 				}
 
-				len = unescape(x, msg->plural, p->max_plural_len);
-			}
+				msg->id_len += len;
+				break;
+			case po_ctxt:
+				if (p->stage == ps_parse) {
+					len = unescape(x, &msg->ctxt[msg->ctxt_len], p->max_ctxt_len);
+				}
 
-			msg->plural_len = len;
-			p->previous = po_plural;
-		} else if ((y = strstarts(z, "str"))) {
-			if (!msg->id_len && !p->first)
+				msg->ctxt_len += len;
+				break;
+			default:
 				return -po_invalid_entry;
+			}
 
-			if (isspace(*y)) {
-				if (p->strcnt || msg->plural_len)
-					return -po_invalid_entry;
-
-				cnt = (p->strcnt = 1) - 1;
-			} else if (*y == '[') {
-				if (!msg->plural_len)
-					return -po_invalid_entry;
-
-				if (y[2] != ']' || !isspace(y[3])) return -po_excepted_token;
+			break;
+		default:
+			if ((z = strstarts(&line[line_pos], "msg"))) {
+				if ( (x = strchr(z, '"')) == NULL)
+					return -po_excepted_token;
 
-				p->strcnt = (cnt = y[1] - '0') + 1;
+				y = ++x;
+				while (true) {
+					if ( (y = strchr(y, '"')) == NULL)
+						return -po_excepted_token;
 
-				if (p->strict && p->strcnt > p->hdr.nplurals) {
-					return -po_plurals_overflow;
+					// only if it's not an escaped "
+					if (*(y-1) != '\\') break;
 				}
-			} else {
-				return -po_excepted_token;
-			}
 
-			if ((t = poparser_feed_hdr(p, x)) != po_success) {
-				return t;
-			}
-
-			if (p->stage == ps_parse) {
-				if (msg->str[cnt] == NULL) {
-					return -po_internal;
+				len = y - x;
+				*y = 0;
+				line_pos += y - &line[line_pos] + 1;
+
+				if ((y = strstarts(z, "ctxt")) && isspace(*y)) {
+					if ( (t = poparser_clean(p, msg)) != po_success)
+						return t;
+
+					if (msg->id_len || msg->plural_len)
+						return -po_invalid_entry;
+
+					for (cnt = 0; cnt < st_max; cnt++) {
+						if (strstr(x, sysdep_str[cnt])) {
+							msg->sysdep |= sysdep[cnt];
+						}
+					}
+
+					if (p->stage == ps_parse) {
+						if (msg->ctxt == NULL) {
+							return -po_internal;
+						}
+
+						len = unescape(x, msg->ctxt, p->max_ctxt_len);
+					}
+
+					msg->ctxt_len = len;
+					p->previous = po_ctxt;
+				} else if ((y = strstarts(z, "id")) && isspace(*y)) {
+					if ( (t = poparser_clean(p, msg)) != po_success)
+						return t;
+
+					if (msg->plural_len)
+						return -po_invalid_entry;
+
+					for (cnt = 0; cnt < st_max; cnt++) {
+						if (strstr(x, sysdep_str[cnt])) {
+							msg->sysdep |= sysdep[cnt];
+						}
+					}
+
+					if (p->stage == ps_parse) {
+						if (msg->id == NULL) {
+							return -po_internal;
+						}
+
+						len = unescape(x, msg->id, p->max_id_len);
+					}
+
+					msg->id_len = len;
+					p->previous = po_id;
+				} else if ((y = strstarts(z, "id_plural")) && isspace(*y)) {
+					if (!msg->id_len || p->strcnt)
+						return -po_invalid_entry;
+
+					if (p->stage == ps_parse) {
+						if (msg->plural == NULL) {
+							return -po_internal;
+						}
+
+						len = unescape(x, msg->plural, p->max_plural_len);
+					}
+
+					msg->plural_len = len;
+					p->previous = po_plural;
+				} else if ((y = strstarts(z, "str"))) {
+					if (!msg->id_len && !p->first)
+						return -po_invalid_entry;
+
+					if (isspace(*y)) {
+						if (p->strcnt || msg->plural_len)
+							return -po_invalid_entry;
+
+						cnt = (p->strcnt = 1) - 1;
+					} else if (*y == '[') {
+						if (!msg->plural_len)
+							return -po_invalid_entry;
+
+						if (y[2] != ']' || !isspace(y[3])) return -po_excepted_token;
+
+						p->strcnt = (cnt = y[1] - '0') + 1;
+
+						if (p->strict && p->strcnt > p->hdr.nplurals) {
+							return -po_plurals_overflow;
+						}
+					} else {
+						return -po_excepted_token;
+					}
+
+					if ((t = poparser_feed_hdr(p, x)) != po_success) {
+						return t;
+					}
+
+					if (p->stage == ps_parse) {
+						if (msg->str[cnt] == NULL) {
+							return -po_internal;
+						}
+
+						len = unescape(x, msg->str[cnt], p->max_strlen[cnt]);
+					}
+
+					msg->strlen[cnt] = len;
+					p->previous = po_str;
+				} else {
+					return -po_invalid_entry;
 				}
-
-				len = unescape(x, msg->str[cnt], p->max_strlen[cnt]);
 			}
-
-			msg->strlen[cnt] = len;
-			p->previous = po_str;
-		} else {
-			return -po_invalid_entry;
 		}
-	}
 
-	if (p->stage == ps_size) {
-		if (p->max_strlen[cnt] < msg->strlen[cnt])
-			p->max_strlen[cnt] = msg->strlen[cnt] + 1;
-		if (p->max_plural_len < msg->plural_len)
-			p->max_plural_len = msg->plural_len + 1;
-		if (p->max_id_len < msg->id_len)
-			p->max_id_len = msg->id_len + 1;
-		if (p->max_ctxt_len < msg->ctxt_len)
-			p->max_ctxt_len = msg->ctxt_len + 1;
+		if (p->stage == ps_size) {
+			if (p->max_strlen[cnt] < msg->strlen[cnt])
+				p->max_strlen[cnt] = msg->strlen[cnt] + 1;
+			if (p->max_plural_len < msg->plural_len)
+				p->max_plural_len = msg->plural_len + 1;
+			if (p->max_id_len < msg->id_len)
+				p->max_id_len = msg->id_len + 1;
+			if (p->max_ctxt_len < msg->ctxt_len)
+				p->max_ctxt_len = msg->ctxt_len + 1;
+		}
 	}
 
 	return po_success;
@@ -417,7 +427,7 @@ size_t poparser_sysdep(const char *in, char *out, int num) {
 				m = strlen(y);
 				if (outs)
 					memcpy(out, y, m);
-				out += m; 
+				out += m;
 
 				break;
 			}
author	xhe <xw897002528@gmail.com>	2020-01-22 14:42:04 +0800
committer	xhe <xw897002528@gmail.com>	2020-01-22 22:03:27 +0800
commit	c83067e03d2929f358dbdc10bea49d3df0e5e78e (patch)
tree	208d3c4e98618f530b1fc9bb37d6a563178e2468
parent	adaa9c64921e80f2b8dd3610ffb508618b9204f3 (diff)
download	gettext-tiny-c83067e03d2929f358dbdc10bea49d3df0e5e78e.tar.gz