summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Arnold D. Robbins <arnold@skeeve.com> 2016-11-18 06:00:17 +0200
committer Arnold D. Robbins <arnold@skeeve.com> 2016-11-18 06:00:17 +0200
commit e8b0cf14d975304166c58a2d04a2943ab821367a (patch)
tree 6984492869ca192ee6a7fdf330e6867a5f61ae09
parent cc04afb329cea035d0d9b67cd3b677e06b2f3996 (diff)
download gawk-e8b0cf14d975304166c58a2d04a2943ab821367a.tar.gz
Audit use of stptr for NUL termination. Update doc before merge to master.
-rw-r--r-- ChangeLog 24
-rw-r--r-- array.c 17
-rw-r--r-- awk.h 2
-rw-r--r-- awkgram.c 2
-rw-r--r-- awkgram.y 2
-rw-r--r-- builtin.c 48
-rw-r--r-- debug.c 55
-rw-r--r-- doc/ChangeLog 6
-rw-r--r-- doc/gawk.info 1020
-rw-r--r-- doc/gawk.texi 148
-rw-r--r-- doc/gawktexi.in 148
-rw-r--r-- eval.c 16
-rw-r--r-- interpret.h 2
-rw-r--r-- io.c 30
-rw-r--r-- mpfr.c 12
-rw-r--r-- msg.c 6
16 files changed, 941 insertions, 597 deletions
diff --git a/ChangeLog b/ChangeLog
index f691d340..de343887 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,27 @@
+2016-11-17 Arnold D. Robbins <arnold@skeeve.com>
+
+ General cleanup for zero termination of strings.
+
+ * array.c (do_delete): Use %.*s.
+ (value_info): Get length and use %.*s.
+ (asort_actual): Save and restore character after end.
+ * awkgram.y (split_comment): Use make_string, not make_str_node.
+ * builtin.c (do_fflush): Use %.*s.
+ (localecategory_from_argument, do_dcgettext, do_dcngettext,
+ do_bindtextdomain): Save and restore character after end.
+ * debug.c (do_info, print_array, print_subscript, do_print_var,
+ do_set_var, display, do_watch, print_watch_item, serialize_subscript,
+ do_print_f): Use %.*s.
+ * eval.c (cmp_nodes, fmt_index): Save and restore character after end.
+ * interpret.h (r_interpret): Fix computation for concatenation of
+ wide strings.
+ * io.c (is_non_fatal_redirect): Add length parameter; save and
+ restore character after last. Adjust all other declarations and calls.
+ (do_close): Save and restore character after end.
+ * mpfr.c (ieee_fmts): Adjust table indentation.
+ (do_mpfr_strtonum): Clear wide string members of the union.
+ * msg.c (err): Use %.*s.
+
2016-11-07 Arnold D. Robbins <arnold@skeeve.com>
* awk.h [USER_INPUT]: Renamed from MAYBE_NUM.
diff --git a/array.c b/array.c
index c189fe3b..cee1c729 100644
--- a/array.c
+++ b/array.c
@@ -583,8 +583,8 @@ do_delete(NODE *symbol, int nsubs)
if (val == NULL) {
if (do_lint) {
subs = force_string(subs);
- lintwarn(_("delete: index `%s' not in array `%s'"),
- subs->stptr, array_vname(symbol));
+ lintwarn(_("delete: index `%.*s' not in array `%s'"),
+ (int) subs->stlen, subs->stptr, array_vname(symbol));
}
/* avoid memory leak, free all subs */
free_subs(i);
@@ -660,7 +660,6 @@ value_info(NODE *n)
{
#define PREC_NUM -1
-#define PREC_STR -1
if (n == Nnull_string || n == Null_field) {
fprintf(output_fp, "<(null)>");
@@ -669,7 +668,7 @@ value_info(NODE *n)
if ((n->flags & (STRING|STRCUR)) != 0) {
fprintf(output_fp, "<");
- fprintf(output_fp, "\"%.*s\"", PREC_STR, n->stptr);
+ fprintf(output_fp, "\"%.*s\"", (int) n->stlen, n->stptr);
if ((n->flags & (NUMBER|NUMCUR)) != 0) {
#ifdef HAVE_MPFR
if (is_mpg_float(n))
@@ -702,6 +701,8 @@ value_info(NODE *n)
fprintf(output_fp, ":");
if ((n->flags & (STRING|STRCUR)) == STRCUR) {
+ size_t len;
+
fprintf(output_fp, "][");
fprintf(output_fp, "stfmt=%d, ", n->stfmt);
/*
@@ -710,13 +711,14 @@ value_info(NODE *n)
* was originally set as a string, or it's a number that has
* an integer value.
*/
+ len = fmt_list[n->stfmt]->stlen;
+ fmt_list[n->stfmt]->stptr[len] = '\0';
fprintf(output_fp, "FMT=\"%s\"",
n->stfmt == STFMT_UNUSED ? "<unused>"
: fmt_list[n->stfmt]->stptr);
}
#undef PREC_NUM
-#undef PREC_STR
}
@@ -803,6 +805,7 @@ asort_actual(int nargs, sort_context_t ctxt)
NODE **list = NULL, **ptr, **lhs;
unsigned long num_elems, i;
const char *sort_str;
+ char save;
if (nargs == 3) /* 3rd optional arg */
s = POP_STRING();
@@ -811,6 +814,8 @@ asort_actual(int nargs, sort_context_t ctxt)
s = force_string(s);
sort_str = s->stptr;
+ save = s->stptr[s->stlen];
+ s->stptr[s->stlen] = '\0';
if (s->stlen == 0) { /* default sorting */
if (ctxt == ASORT)
sort_str = "@val_type_asc";
@@ -851,6 +856,7 @@ asort_actual(int nargs, sort_context_t ctxt)
/* sorting happens inside assoc_list */
list = assoc_list(array, sort_str, ctxt);
+ s->stptr[s->stlen] = save;
DEREF(s);
num_elems = assoc_length(array);
@@ -913,6 +919,7 @@ asort_actual(int nargs, sort_context_t ctxt)
arr = make_array();
subs = force_string(subs);
arr->vname = subs->stptr;
+ arr->vname[subs->stlen] = '\0';
subs->stptr = NULL;
subs->flags &= ~STRCUR;
arr->parent_array = array; /* actual parent, not the temporary one. */
diff --git a/awk.h b/awk.h
index 9a7f7eea..dcf97bb0 100644
--- a/awk.h
+++ b/awk.h
@@ -1562,7 +1562,7 @@ extern struct redirect *getredirect(const char *str, int len);
extern bool inrec(IOBUF *iop, int *errcode);
extern int nextfile(IOBUF **curfile, bool skipping);
extern bool is_non_fatal_std(FILE *fp);
-extern bool is_non_fatal_redirect(const char *str);
+extern bool is_non_fatal_redirect(const char *str, size_t len);
/* main.c */
extern int arg_assign(char *arg, bool initing);
extern int is_std_var(const char *var);
diff --git a/awkgram.c b/awkgram.c
index 78839fe9..f552d2b9 100644
--- a/awkgram.c
+++ b/awkgram.c
@@ -5615,7 +5615,7 @@ split_comment(void)
if (p[l] == '\n' && p[l+1] == '\n') {
function_comment = comment_to_save;
n = function_comment->memory;
- function_comment->memory = make_str_node(p + l + 2, n->stlen - l - 2, 0);
+ function_comment->memory = make_string(p + l + 2, n->stlen - l - 2);
/* create program comment */
program_comment = bcalloc(Op_comment, 1, sourceline);
program_comment->source_file = comment_to_save->source_file;
diff --git a/awkgram.y b/awkgram.y
index 4ed1e78b..e05269ec 100644
--- a/awkgram.y
+++ b/awkgram.y
@@ -3238,7 +3238,7 @@ split_comment(void)
if (p[l] == '\n' && p[l+1] == '\n') {
function_comment = comment_to_save;
n = function_comment->memory;
- function_comment->memory = make_str_node(p + l + 2, n->stlen - l - 2, 0);
+ function_comment->memory = make_string(p + l + 2, n->stlen - l - 2);
/* create program comment */
program_comment = bcalloc(Op_comment, 1, sourceline);
program_comment->source_file = comment_to_save->source_file;
diff --git a/builtin.c b/builtin.c
index e6cfee3b..9a45e10f 100644
--- a/builtin.c
+++ b/builtin.c
@@ -135,7 +135,7 @@ wrerror:
/* otherwise die verbosely */
- if ((rp != NULL) ? is_non_fatal_redirect(rp->value) : is_non_fatal_std(fp))
+ if ((rp != NULL) ? is_non_fatal_redirect(rp->value, strlen(rp->value)) : is_non_fatal_std(fp))
update_ERRNO_int(errno);
else
fatal(_("%s to \"%s\" failed (%s)"), from,
@@ -194,6 +194,7 @@ do_fflush(int nargs)
FILE *fp;
int status = 0;
const char *file;
+ int len;
/*
* November, 2012.
@@ -220,6 +221,7 @@ do_fflush(int nargs)
tmp = POP_STRING();
file = tmp->stptr;
+ len = tmp->stlen;
/* fflush("") */
if (tmp->stlen == 0) {
@@ -234,11 +236,11 @@ do_fflush(int nargs)
if (rp != NULL) {
if ((rp->flag & (RED_WRITE|RED_APPEND)) == 0) {
if ((rp->flag & RED_PIPE) != 0)
- warning(_("fflush: cannot flush: pipe `%s' opened for reading, not writing"),
- file);
+ warning(_("fflush: cannot flush: pipe `%.*s' opened for reading, not writing"),
+ len, file);
else
- warning(_("fflush: cannot flush: file `%s' opened for reading, not writing"),
- file);
+ warning(_("fflush: cannot flush: file `%.*s' opened for reading, not writing"),
+ len, file);
DEREF(tmp);
return make_number((AWKNUM) status);
}
@@ -246,13 +248,13 @@ do_fflush(int nargs)
if (fp != NULL)
status = rp->output.gawk_fflush(fp, rp->output.opaque);
else if ((rp->flag & RED_TWOWAY) != 0)
- warning(_("fflush: cannot flush: two-way pipe `%s' has closed write end"),
- file);
+ warning(_("fflush: cannot flush: two-way pipe `%.*s' has closed write end"),
+ len, file);
} else if ((fp = stdfile(tmp->stptr, tmp->stlen)) != NULL) {
status = fflush(fp);
} else {
status = -1;
- warning(_("fflush: `%s' is not an open file, pipe or co-process"), file);
+ warning(_("fflush: `%.*s' is not an open file, pipe or co-process"), len, file);
}
DEREF(tmp);
return make_number((AWKNUM) status);
@@ -1685,7 +1687,7 @@ do_printf(int nargs, int redirtype)
rp = redirect(redir_exp, redirtype, & errflg, true);
if (rp != NULL) {
if ((rp->flag & RED_TWOWAY) != 0 && rp->output.fp == NULL) {
- if (is_non_fatal_redirect(redir_exp->stptr)) {
+ if (is_non_fatal_redirect(redir_exp->stptr, redir_exp->stlen)) {
update_ERRNO_int(EBADF);
return;
}
@@ -2169,7 +2171,7 @@ do_print(int nargs, int redirtype)
rp = redirect(redir_exp, redirtype, & errflg, true);
if (rp != NULL) {
if ((rp->flag & RED_TWOWAY) != 0 && rp->output.fp == NULL) {
- if (is_non_fatal_redirect(redir_exp->stptr)) {
+ if (is_non_fatal_redirect(redir_exp->stptr, redir_exp->stlen)) {
update_ERRNO_int(EBADF);
return;
}
@@ -2243,7 +2245,7 @@ do_print_rec(int nargs, int redirtype)
rp = redirect(redir_exp, redirtype, & errflg, true);
if (rp != NULL) {
if ((rp->flag & RED_TWOWAY) != 0 && rp->output.fp == NULL) {
- if (is_non_fatal_redirect(redir_exp->stptr)) {
+ if (is_non_fatal_redirect(redir_exp->stptr, redir_exp->stlen)) {
update_ERRNO_int(EBADF);
return;
}
@@ -3679,6 +3681,8 @@ localecategory_from_argument(NODE *t)
char *category;
int lc_cat = -1;
+ char save = t->stptr[t->stlen];
+ t->stptr[t->stlen] = '\0';
category = t->stptr;
/* binary search the table */
@@ -3697,6 +3701,7 @@ localecategory_from_argument(NODE *t)
break;
}
}
+ t->stptr[t->stlen] = save;
if (lc_cat == -1) /* not there */
fatal(_("dcgettext: `%s' is not a valid locale category"), category);
@@ -3725,6 +3730,8 @@ do_dcgettext(int nargs)
#if ENABLE_NLS && defined(LC_MESSAGES) && HAVE_DCGETTEXT
int lc_cat;
char *domain;
+ char save;
+ bool saved_end = false;
if (nargs == 3) { /* third argument */
tmp = POP_STRING();
@@ -3736,6 +3743,9 @@ do_dcgettext(int nargs)
if (nargs >= 2) { /* second argument */
t2 = POP_STRING();
domain = t2->stptr;
+ save = domain[t2->stlen];
+ domain[t2->stlen] = '\0';
+ saved_end = true;
} else
domain = TEXTDOMAIN;
#else
@@ -3754,6 +3764,8 @@ do_dcgettext(int nargs)
#if ENABLE_NLS && defined(LC_MESSAGES) && HAVE_DCGETTEXT
the_result = dcgettext(domain, string, lc_cat);
+ if (saved_end)
+ domain[t2->stlen] = save;
if (t2 != NULL)
DEREF(t2);
#else
@@ -3776,6 +3788,8 @@ do_dcngettext(int nargs)
#if ENABLE_NLS && defined(LC_MESSAGES) && HAVE_DCGETTEXT
int lc_cat;
char *domain;
+ char save;
+ bool saved_end = false;
if (nargs == 5) { /* fifth argument */
tmp = POP_STRING();
@@ -3788,6 +3802,9 @@ do_dcngettext(int nargs)
if (nargs >= 4) { /* fourth argument */
t3 = POP_STRING();
domain = t3->stptr;
+ save = domain[t3->stlen];
+ domain[t3->stlen] = '\0';
+ saved_end = true;
} else
domain = TEXTDOMAIN;
#else
@@ -3814,6 +3831,8 @@ do_dcngettext(int nargs)
#if ENABLE_NLS && defined(LC_MESSAGES) && HAVE_DCGETTEXT
the_result = dcngettext(domain, string1, string2, number, lc_cat);
+ if (saved_end)
+ domain[t3->stlen] = save;
if (t3 != NULL)
DEREF(t3);
#else
@@ -3846,10 +3865,15 @@ do_bindtextdomain(int nargs)
/* set defaults */
directory = NULL;
domain = TEXTDOMAIN;
+ char save;
+ bool saved_end = false;
if (nargs == 2) { /* second argument */
t2 = POP_STRING();
domain = (const char *) t2->stptr;
+ save = t2->stptr[t2->stlen];
+ t2->stptr[t2->stlen] = '\0';
+ saved_end = true;
}
/* first argument */
@@ -3860,6 +3884,8 @@ do_bindtextdomain(int nargs)
the_result = bindtextdomain(domain, directory);
DEREF(t1);
+ if (saved_end)
+ t2->stptr[t2->stlen] = save;
if (t2 != NULL)
DEREF(t2);
diff --git a/debug.c b/debug.c
index 6e3082d7..9568c0a1 100644
--- a/debug.c
+++ b/debug.c
@@ -898,7 +898,7 @@ do_info(CMDARG *arg, int cmd ATTRIBUTE_UNUSED)
for (i = 0; i < d->num_subs; i++) {
NODE *sub;
sub = d->subs[i];
- gprintf(out_fp, "[\"%s\"]", sub->stptr);
+ gprintf(out_fp, "[\"%.*s\"]", (int) sub->stlen, sub->stptr);
}
gprintf(out_fp, "\n");
} else if (IS_FIELD(d))
@@ -1090,7 +1090,7 @@ print_array(volatile NODE *arr, char *arr_name)
if (r->type == Node_var_array)
ret = print_array(r, r->vname);
else {
- gprintf(out_fp, "%s[\"%s\"] = ", arr_name, subs->stptr);
+ gprintf(out_fp, "%s[\"%.*s\"] = ", arr_name, (int) subs->stlen, subs->stptr);
valinfo((NODE *) r, gprintf, out_fp);
}
}
@@ -1116,7 +1116,7 @@ print_subscript(NODE *arr, char *arr_name, CMDARG *a, int count)
subs = a->a_node;
r = in_array(arr, subs);
if (r == NULL)
- fprintf(out_fp, _("[\"%s\"] not in array `%s'\n"), subs->stptr, arr_name);
+ fprintf(out_fp, _("[\"%.*s\"] not in array `%s'\n"), (int) subs->stlen, subs->stptr, arr_name);
else if (r->type == Node_var_array) {
if (count > 1)
print_subscript(r, r->vname, a->next, count - 1);
@@ -1126,7 +1126,7 @@ print_subscript(NODE *arr, char *arr_name, CMDARG *a, int count)
print_symbol(r, false);
}
} else {
- fprintf(out_fp, "%s[\"%s\"] = ", arr_name, subs->stptr);
+ fprintf(out_fp, "%s[\"%.*s\"] = ", arr_name, (int) subs->stlen, subs->stptr);
valinfo(r, fprintf, out_fp);
}
}
@@ -1168,12 +1168,12 @@ do_print_var(CMDARG *arg, int cmd ATTRIBUTE_UNUSED)
subs = a->a_node;
value = in_array(r, subs);
if (value == NULL) {
- fprintf(out_fp, _("[\"%s\"] not in array `%s'\n"),
- subs->stptr, name);
+ fprintf(out_fp, _("[\"%.*s\"] not in array `%s'\n"),
+ (int) subs->stlen, subs->stptr, name);
break;
} else if (value->type != Node_var_array) {
- fprintf(out_fp, _("`%s[\"%s\"]' is not an array\n"),
- name, subs->stptr);
+ fprintf(out_fp, _("`%s[\"%.*s\"]' is not an array\n"),
+ name, (int) subs->stlen, subs->stptr);
break;
} else {
r = value;
@@ -1255,15 +1255,15 @@ do_set_var(CMDARG *arg, int cmd ATTRIBUTE_UNUSED)
if (count == 1) {
if (value != NULL && value->type == Node_var_array)
- d_error(_("attempt to use array `%s[\"%s\"]' in a scalar context"),
- name, subs->stptr);
+ d_error(_("attempt to use array `%s[\"%.*s\"]' in a scalar context"),
+ name, (int) subs->stlen, subs->stptr);
else {
arg = arg->next;
val = arg->a_node;
lhs = assoc_lookup(r, subs);
unref(*lhs);
*lhs = dupnode(val);
- fprintf(out_fp, "%s[\"%s\"] = ", name, subs->stptr);
+ fprintf(out_fp, "%s[\"%.*s\"] = ", name, (int) subs->stlen, subs->stptr);
valinfo(*lhs, fprintf, out_fp);
}
} else {
@@ -1277,8 +1277,8 @@ do_set_var(CMDARG *arg, int cmd ATTRIBUTE_UNUSED)
*lhs = array;
r = array;
} else if (value->type != Node_var_array) {
- d_error(_("attempt to use scalar `%s[\"%s\"]' as array"),
- name, subs->stptr);
+ d_error(_("attempt to use scalar `%s[\"%.*s\"]' as array"),
+ name, (int) subs->stlen, subs->stptr);
break;
} else {
r = value;
@@ -1525,8 +1525,8 @@ display(struct list_item *d)
sub = d->subs[i];
r = in_array(symbol, sub);
if (r == NULL) {
- fprintf(out_fp, _("%d: [\"%s\"] not in array `%s'\n"),
- d->number, sub->stptr, d->sname);
+ fprintf(out_fp, _("%d: [\"%.*s\"] not in array `%s'\n"),
+ d->number, (int) sub->stlen, sub->stptr, d->sname);
break;
}
if (r->type == Node_var_array) {
@@ -1536,8 +1536,8 @@ display(struct list_item *d)
} else {
if (i != count - 1)
return; /* FIXME msg and delete item ? */
- fprintf(out_fp, "%d: %s[\"%s\"] = ", d->number,
- d->sname, sub->stptr);
+ fprintf(out_fp, "%d: %s[\"%.*s\"] = ", d->number,
+ d->sname, (int) sub->stlen, sub->stptr);
valinfo(r, fprintf, out_fp);
}
}
@@ -1822,7 +1822,7 @@ do_watch(CMDARG *arg, int cmd ATTRIBUTE_UNUSED)
fprintf(out_fp, "%s", w->sname);
for (i = 0; i < w->num_subs; i++) {
sub = w->subs[i];
- fprintf(out_fp, "[\"%s\"]", sub->stptr);
+ fprintf(out_fp, "[\"%.*s\"]", (int) sub->stlen, sub->stptr);
}
fprintf(out_fp, "\n");
} else if (IS_FIELD(w))
@@ -3411,7 +3411,7 @@ print_watch_item(struct list_item *w)
fprintf(out_fp, "%s", w->sname);
for (i = 0; i < w->num_subs; i++) {
sub = w->subs[i];
- fprintf(out_fp, "[\"%s\"]", sub->stptr);
+ fprintf(out_fp, "[\"%.*s\"]", (int) sub->stlen, sub->stptr);
}
fprintf(out_fp, "\n");
} else if (IS_FIELD(w))
@@ -4326,8 +4326,9 @@ serialize_subscript(char *buf, int buflen, struct list_item *item)
bl = nchar;
for (i = 0; i < item->num_subs; i++) {
sub = item->subs[i];
- nchar = snprintf(buf + bl, buflen - bl, "%lu%c%s%c",
- (unsigned long) sub->stlen, FSEP, sub->stptr, FSEP);
+ nchar = snprintf(buf + bl, buflen - bl, "%lu%c%.*s%c",
+ (unsigned long) sub->stlen, FSEP,
+ (int) sub->stlen, sub->stptr, FSEP);
if (nchar <= 0)
return 0;
bl += nchar;
@@ -5038,19 +5039,19 @@ do_print_f(CMDARG *arg, int cmd ATTRIBUTE_UNUSED)
if (value == NULL)
tmp[i] = Nnull_string; /* FIXME: goto done ? */
else if (value->type == Node_var_array) {
- d_error(_("attempt to use array `%s[\"%s\"]' in a scalar context"),
- name, subs->stptr);
+ d_error(_("attempt to use array `%s[\"%.*s\"]' in a scalar context"),
+ name, (int) subs->stlen, subs->stptr);
goto done;
} else
tmp[i] = value;
} else {
if (value == NULL) {
- d_error(_("[\"%s\"] not in array `%s'"),
- subs->stptr, name);
+ d_error(_("[\"%.*s\"] not in array `%s'"),
+ (int) subs->stlen, subs->stptr, name);
goto done;
} else if (value->type != Node_var_array) {
- d_error(_("attempt to use scalar `%s[\"%s\"]' as array"),
- name, subs->stptr);
+ d_error(_("attempt to use scalar `%s[\"%.*s\"]' as array"),
+ name, (int) subs->stlen, subs->stptr);
goto done;
} else {
r = value;
diff --git a/doc/ChangeLog b/doc/ChangeLog
index 2e3eef12..07c82d59 100644
--- a/doc/ChangeLog
+++ b/doc/ChangeLog
@@ -1,3 +1,9 @@
+2016-11-18 Arnold D. Robbins <arnold@skeeve.com>
+
+ * gawktexi.in (Variable Typing): Rework and improve discussion
+ of strings, numbers, and strnums. Update description of strnum
+ in other places.
+
2016-11-10 Arnold D. Robbins <arnold@skeeve.com>
* gawktexi.in: Fix example use of dcngettext.
diff --git a/doc/gawk.info b/doc/gawk.info
index 264b7054..412edf51 100644
--- a/doc/gawk.info
+++ b/doc/gawk.info
@@ -8414,11 +8414,72 @@ File: gawk.info, Node: Variable Typing, Next: Comparison Operators, Up: Typin
6.3.2.1 String Type versus Numeric Type
.......................................
-The POSIX standard introduced the concept of a "numeric string", which
-is simply a string that looks like a number--for example, '" +2"'. This
-concept is used for determining the type of a variable. The type of the
-variable is important because the types of two variables determine how
-they are compared. Variable typing follows these rules:
+Scalar objects in 'awk' (variables, array elements, and fields) are
+_dynamically_ typed. This means their type can change as the program
+runs, from "untyped" before any use,(1) to string or number, and then
+from string to number or number to string, as the program progresses.
+
+ You can't do much with untyped variables, other than tell that they
+are untyped. The following program tests 'a' against '""' and '0'; the
+test succeeds when 'a' has never been assigned a value. It also uses
+the built-in 'typeof()' function (not presented yet; *note Type
+Functions::) to show 'a''s type:
+
+ $ gawk 'BEGIN { print (a == "" && a == 0 ?
+ > "a is untyped" : "a has a type!") ; print typeof(a) }'
+ -| a is untyped
+ -| unassigned
+
+ A scalar has numeric type when assigned a numeric value, such as from
+a numeric constant, or from another scalar with numeric type:
+
+ $ gawk 'BEGIN { a = 42 ; print typeof(a)
+ > b = a ; print typeof(b) }'
+ number
+ number
+
+ Similarly, a scalar has string type when assigned a string value,
+such as from a string constant, or from another scalar with string type:
+
+ $ gawk 'BEGIN { a = "forty two" ; print typeof(a)
+ > b = a ; print typeof(b) }'
+ string
+ string
+
+ So far, this is all simple and straightforward. What happens,
+though, when 'awk' has to process data from a user? Let's start with
+field data. What should the following command produce as output?
+
+ echo hello | awk '{ printf("%s %s < 42\n", $1,
+ ($1 < 42 ? "is" : "is not")) }'
+
+Since 'hello' is alphabetic data, 'awk' can only do a string comparison.
+Internally, it converts '42' into '"42"' and compares the two string
+values '"hello"' and '"42"'. Here's the result:
+
+ $ echo hello | awk '{ printf("%s %s < 42\n", $1,
+ > ($1 < 42 ? "is" : "is not")) }'
+ -| hello is not < 42
+
+ However, what happens when data from a user _looks like_ a number?
+On the one hand, in reality, the input data consists of characters, not
+binary numeric values. But, on the other hand, the data looks numeric,
+and 'awk' really ought to treat it as such. And indeed, it does:
+
+ $ echo 37 | awk '{ printf("%s %s < 42\n", $1,
+ > ($1 < 42 ? "is" : "is not")) }'
+ -| 37 is < 42
+
+ Here are the rules for when 'awk' treats data as a number, and for
+when it treats data as a string.
+
+ The POSIX standard uses the term "numeric string" for input data that
+looks numeric. The '37' in the previous example is a numeric string.
+So what is the type of a numeric string? Answer: numeric.
+
+ The type of a variable is important because the types of two
+variables determine how they are compared. Variable typing follows
+these definitions and rules:
* A numeric constant or the result of a numeric operation has the
"numeric" attribute.
@@ -8429,8 +8490,9 @@ they are compared. Variable typing follows these rules:
* Fields, 'getline' input, 'FILENAME', 'ARGV' elements, 'ENVIRON'
elements, and the elements of an array created by 'match()',
'split()', and 'patsplit()' that are numeric strings have the
- "strnum" attribute. Otherwise, they have the "string" attribute.
- Uninitialized variables also have the "strnum" attribute.
+ "strnum" attribute.(2) Otherwise, they have the "string"
+ attribute. Uninitialized variables also have the "strnum"
+ attribute.
* Attributes propagate across assignments but are not changed by any
use.
@@ -8449,16 +8511,16 @@ operation:
comparison may be used. This depends upon the attributes of the
operands, according to the following symmetric matrix:
- +-------------------------------
- | STRING NUMERIC STRNUM
- -----+-------------------------------
- |
- STRING | string string string
- |
- NUMERIC | string numeric numeric
- |
- STRNUM | string numeric numeric
- -----+-------------------------------
+ +----------------------------------------------
+ | STRING NUMERIC STRNUM
+--------+----------------------------------------------
+ |
+STRING | string string string
+ |
+NUMERIC | string numeric numeric
+ |
+STRNUM | string numeric numeric
+--------+----------------------------------------------
The basic idea is that user input that looks numeric--and _only_ user
input--should be treated as numeric, even though it is actually made of
@@ -8469,16 +8531,18 @@ for comparison purposes.
In short, when one operand is a "pure" string, such as a string
constant, then a string comparison is performed. Otherwise, a numeric
-comparison is performed.
+comparison is performed. (The primary difference between a number and a
+strnum is that for strnums 'gawk' preserves the original string value
+that the scalar had when it came in.)
- This point bears additional emphasis: All user input is made of
-characters, and so is first and foremost of string type; input strings
-that look numeric are additionally given the strnum attribute. Thus,
-the six-character input string ' +3.14' receives the strnum attribute.
-In contrast, the eight characters '" +3.14"' appearing in program text
-comprise a string constant. The following examples print '1' when the
-comparison between the two different constants is true, and '0'
-otherwise:
+ This point bears additional emphasis: Input that looks numeric _is_
+numeric. All other input is treated as strings.
+
+ Thus, the six-character input string ' +3.14' receives the strnum
+attribute. In contrast, the eight characters '" +3.14"' appearing in
+program text comprise a string constant. The following examples print
+'1' when the comparison between the two different constants is true, and
+'0' otherwise:
$ echo ' +3.14' | awk '{ print($0 == " +3.14") }' True
-| 1
@@ -8497,6 +8561,19 @@ otherwise:
$ echo ' +3.14' | awk '{ print($1 == 3.14) }' True
-| 1
+ You can see the type of an input field (or other user input) using
+'typeof()':
+
+ $ echo hello 37 | gawk '{ print typeof($1), typeof($2) }'
+ -| string strnum
+
+ ---------- Footnotes ----------
+
+ (1) 'gawk' calls this "unassigned", as the following example shows.
+
+ (2) Thus, a POSIX numeric string and 'gawk''s strnum are the same
+thing.
+

File: gawk.info, Node: Comparison Operators, Next: POSIX String Comparison, Prev: Variable Typing, Up: Typing and Comparison
@@ -13618,8 +13695,8 @@ contexts.
X is a string.
'"strnum"'
- X is a string that might be a number, such as a field or the
- result of calling 'split()'. (I.e., X has the STRNUM
+ X is a number that started life as user input, such as a field
+ or the result of calling 'split()'. (I.e., X has the strnum
attribute; *note Variable Typing::.)
'"unassigned"'
@@ -13627,8 +13704,9 @@ contexts.
For example:
BEGIN {
- a[1] # creates a[1] but it has no assigned value
- print typeof(a[1]) # scalar_u
+ # creates a[1] but it has no assigned value
+ a[1]
+ print typeof(a[1]) # unassigned
}
'"untyped"'
@@ -22100,62 +22178,6 @@ some limitations. A few that it's worth being aware of are:
* The 'gawk' debugger only accepts source code supplied with the '-f'
option.
- One other point is worth discussing. Conventional debuggers run in a
-separate process (and thus address space) from the programs that they
-debug (the "debuggee", if you will).
-
- The 'gawk' debugger is different; it is an integrated part of 'gawk'
-itself. This makes it possible, in rare cases, for 'gawk' to become an
-excellent demonstrator of Heisenberg Uncertainty physics, where the mere
-act of observing something can change it. Consider the following:(1)
-
- $ cat test.awk
- -| { print typeof($1), typeof($2) }
- $ cat test.data
- -| abc 123
- $ gawk -f test.awk test.data
- -| strnum strnum
-
- This is all as expected: field data has the STRNUM attribute (*note
-Variable Typing::). Now watch what happens when we run this program
-under the debugger:
-
- $ gawk -D -f test.awk test.data
- gawk> w $1 Set watchpoint on $1
- -| Watchpoint 1: $1
- gawk> w $2 Set watchpoint on $2
- -| Watchpoint 2: $2
- gawk> r Start the program
- -| Starting program:
- -| Stopping in Rule ...
- -| Watchpoint 1: $1 Watchpoint fires
- -| Old value: ""
- -| New value: "abc"
- -| main() at `test.awk':1
- -| 1 { print typeof($1), typeof($2) }
- gawk> n Keep going ...
- -| Watchpoint 2: $2 Watchpoint fires
- -| Old value: ""
- -| New value: "123"
- -| main() at `test.awk':1
- -| 1 { print typeof($1), typeof($2) }
- gawk> n Get result from typeof()
- -| strnum number Result for $2 isn't right
- -| Program exited normally with exit value: 0
- gawk> quit
-
- In this case, the act of comparing the new value of '$2' with the old
-one caused 'gawk' to evaluate it and determine that it is indeed a
-number, and this is reflected in the result of 'typeof()'.
-
- Cases like this where the debugger is not transparent to the
-program's execution should be rare. If you encounter one, please report
-it (*note Bugs::).
-
- ---------- Footnotes ----------
-
- (1) Thanks to Hermann Peifer for this example.
-

File: gawk.info, Node: Debugging Summary, Prev: Limitations, Up: Debugger
@@ -23436,16 +23458,25 @@ operations:
* The API defines several simple 'struct's that map values as seen
from 'awk'. A value can be a 'double', a string, or an array (as
- in multidimensional arrays, or when creating a new array). String
- values maintain both pointer and length, because embedded NUL
- characters are allowed.
+ in multidimensional arrays, or when creating a new array).
- NOTE: By intent, strings are maintained using the current
+ String values maintain both pointer and length, because embedded
+ NUL characters are allowed.
+
+ NOTE: By intent, 'gawk' maintains strings using the current
multibyte encoding (as defined by 'LC_XXX' environment
variables) and not using wide characters. This matches how
'gawk' stores strings internally and also how characters are
likely to be input into and output from files.
+ NOTE: String values passed to an extension by 'gawk' are
+ always NUL-terminated. Thus it is safe to pass such string
+ values to standard library and system routines. However,
+ because 'gawk' allows embedded NUL characters in string data,
+ you should check that 'strlen(SOME_STRING)' matches the length
+ for that string passed to the extension before using it as a
+ regular C string.
+
* When retrieving a value (such as a parameter or that of a global
variable or array element), the extension requests a specific type
(number, string, scalar, value cookie, array, or "undefined").
@@ -34229,7 +34260,7 @@ Index
* numeric constants: Scalar Constants. (line 6)
* numeric functions: Numeric Functions. (line 6)
* numeric, output format: OFMT. (line 6)
-* numeric, strings: Variable Typing. (line 6)
+* numeric, strings: Variable Typing. (line 65)
* o debugger command (alias for option): Debugger Info. (line 57)
* obsolete features: Obsolete. (line 6)
* octal numbers: Nondecimal-numbers. (line 6)
@@ -34425,7 +34456,7 @@ Index
* POSIX awk, GNU long options and: Options. (line 15)
* POSIX awk, interval expressions in: Regexp Operators. (line 135)
* POSIX awk, next/nextfile statements and: Next Statement. (line 44)
-* POSIX awk, numeric strings and: Variable Typing. (line 6)
+* POSIX awk, numeric strings and: Variable Typing. (line 65)
* POSIX awk, OFMT variable and: OFMT. (line 27)
* POSIX awk, OFMT variable and <1>: Strings And Numbers. (line 56)
* POSIX awk, period (.), using: Regexp Operators. (line 51)
@@ -34935,7 +34966,7 @@ Index
* strings, merging arrays into: Join Function. (line 6)
* strings, null: Regexp Field Splitting.
(line 43)
-* strings, numeric: Variable Typing. (line 6)
+* strings, numeric: Variable Typing. (line 65)
* strtonum: String Functions. (line 391)
* strtonum() function (gawk), --non-decimal-data option and: Nondecimal Data.
(line 35)
@@ -35378,406 +35409,407 @@ Node: Truth Values and Conditions361798
Node: Truth Values362872
Node: Typing and Comparison363920
Node: Variable Typing364740
-Node: Comparison Operators368364
-Ref: table-relational-ops368783
-Node: POSIX String Comparison372278
-Ref: POSIX String Comparison-Footnote-1373973
-Ref: POSIX String Comparison-Footnote-2374112
-Node: Boolean Ops374196
-Ref: Boolean Ops-Footnote-1378678
-Node: Conditional Exp378770
-Node: Function Calls380506
-Node: Precedence384383
-Node: Locales388042
-Node: Expressions Summary389674
-Node: Patterns and Actions392247
-Node: Pattern Overview393367
-Node: Regexp Patterns395044
-Node: Expression Patterns395586
-Node: Ranges399367
-Node: BEGIN/END402475
-Node: Using BEGIN/END403236
-Ref: Using BEGIN/END-Footnote-1405972
-Node: I/O And BEGIN/END406078
-Node: BEGINFILE/ENDFILE408392
-Node: Empty411299
-Node: Using Shell Variables411616
-Node: Action Overview413890
-Node: Statements416215
-Node: If Statement418063
-Node: While Statement419558
-Node: Do Statement421586
-Node: For Statement422734
-Node: Switch Statement425892
-Node: Break Statement428278
-Node: Continue Statement430370
-Node: Next Statement432197
-Node: Nextfile Statement434580
-Node: Exit Statement437232
-Node: Built-in Variables439635
-Node: User-modified440768
-Node: Auto-set448354
-Ref: Auto-set-Footnote-1463007
-Ref: Auto-set-Footnote-2463213
-Node: ARGC and ARGV463269
-Node: Pattern Action Summary467482
-Node: Arrays469912
-Node: Array Basics471241
-Node: Array Intro472085
-Ref: figure-array-elements474060
-Ref: Array Intro-Footnote-1476764
-Node: Reference to Elements476892
-Node: Assigning Elements479356
-Node: Array Example479847
-Node: Scanning an Array481606
-Node: Controlling Scanning484628
-Ref: Controlling Scanning-Footnote-1490027
-Node: Numeric Array Subscripts490343
-Node: Uninitialized Subscripts492527
-Node: Delete494146
-Ref: Delete-Footnote-1496898
-Node: Multidimensional496955
-Node: Multiscanning500050
-Node: Arrays of Arrays501641
-Node: Arrays Summary506408
-Node: Functions508501
-Node: Built-in509539
-Node: Calling Built-in510620
-Node: Numeric Functions512616
-Ref: Numeric Functions-Footnote-1517449
-Ref: Numeric Functions-Footnote-2517806
-Ref: Numeric Functions-Footnote-3517854
-Node: String Functions518126
-Ref: String Functions-Footnote-1541630
-Ref: String Functions-Footnote-2541758
-Ref: String Functions-Footnote-3542006
-Node: Gory Details542093
-Ref: table-sub-escapes543884
-Ref: table-sub-proposed545403
-Ref: table-posix-sub546766
-Ref: table-gensub-escapes548307
-Ref: Gory Details-Footnote-1549130
-Node: I/O Functions549284
-Ref: table-system-return-values555866
-Ref: I/O Functions-Footnote-1557846
-Ref: I/O Functions-Footnote-2557994
-Node: Time Functions558114
-Ref: Time Functions-Footnote-1568636
-Ref: Time Functions-Footnote-2568704
-Ref: Time Functions-Footnote-3568862
-Ref: Time Functions-Footnote-4568973
-Ref: Time Functions-Footnote-5569085
-Ref: Time Functions-Footnote-6569312
-Node: Bitwise Functions569578
-Ref: table-bitwise-ops570172
-Ref: Bitwise Functions-Footnote-1576198
-Ref: Bitwise Functions-Footnote-2576371
-Node: Type Functions576562
-Node: I18N Functions579094
-Node: User-defined580745
-Node: Definition Syntax581550
-Ref: Definition Syntax-Footnote-1587237
-Node: Function Example587308
-Ref: Function Example-Footnote-1590230
-Node: Function Caveats590252
-Node: Calling A Function590770
-Node: Variable Scope591728
-Node: Pass By Value/Reference594722
-Node: Return Statement598221
-Node: Dynamic Typing601200
-Node: Indirect Calls602130
-Ref: Indirect Calls-Footnote-1612381
-Node: Functions Summary612509
-Node: Library Functions615214
-Ref: Library Functions-Footnote-1618821
-Ref: Library Functions-Footnote-2618964
-Node: Library Names619135
-Ref: Library Names-Footnote-1622595
-Ref: Library Names-Footnote-2622818
-Node: General Functions622904
-Node: Strtonum Function624007
-Node: Assert Function627029
-Node: Round Function630355
-Node: Cliff Random Function631896
-Node: Ordinal Functions632912
-Ref: Ordinal Functions-Footnote-1635975
-Ref: Ordinal Functions-Footnote-2636227
-Node: Join Function636437
-Ref: Join Function-Footnote-1638207
-Node: Getlocaltime Function638407
-Node: Readfile Function642149
-Node: Shell Quoting644121
-Node: Data File Management645522
-Node: Filetrans Function646154
-Node: Rewind Function650250
-Node: File Checking652156
-Ref: File Checking-Footnote-1653490
-Node: Empty Files653691
-Node: Ignoring Assigns655670
-Node: Getopt Function657220
-Ref: Getopt Function-Footnote-1668689
-Node: Passwd Functions668889
-Ref: Passwd Functions-Footnote-1677728
-Node: Group Functions677816
-Ref: Group Functions-Footnote-1685714
-Node: Walking Arrays685921
-Node: Library Functions Summary688929
-Node: Library Exercises690335
-Node: Sample Programs690800
-Node: Running Examples691570
-Node: Clones692298
-Node: Cut Program693522
-Node: Egrep Program703451
-Ref: Egrep Program-Footnote-1710963
-Node: Id Program711073
-Node: Split Program714753
-Ref: Split Program-Footnote-1718212
-Node: Tee Program718341
-Node: Uniq Program721131
-Node: Wc Program728557
-Ref: Wc Program-Footnote-1732812
-Node: Miscellaneous Programs732906
-Node: Dupword Program734119
-Node: Alarm Program736149
-Node: Translate Program741004
-Ref: Translate Program-Footnote-1745569
-Node: Labels Program745839
-Ref: Labels Program-Footnote-1749190
-Node: Word Sorting749274
-Node: History Sorting753346
-Node: Extract Program755181
-Node: Simple Sed762710
-Node: Igawk Program765784
-Ref: Igawk Program-Footnote-1780115
-Ref: Igawk Program-Footnote-2780317
-Ref: Igawk Program-Footnote-3780439
-Node: Anagram Program780554
-Node: Signature Program783616
-Node: Programs Summary784863
-Node: Programs Exercises786077
-Ref: Programs Exercises-Footnote-1790206
-Node: Advanced Features790297
-Node: Nondecimal Data792287
-Node: Array Sorting793878
-Node: Controlling Array Traversal794578
-Ref: Controlling Array Traversal-Footnote-1802945
-Node: Array Sorting Functions803063
-Ref: Array Sorting Functions-Footnote-1808154
-Node: Two-way I/O808350
-Ref: Two-way I/O-Footnote-1814900
-Ref: Two-way I/O-Footnote-2815087
-Node: TCP/IP Networking815169
-Node: Profiling818287
-Ref: Profiling-Footnote-1826780
-Node: Advanced Features Summary827103
-Node: Internationalization828947
-Node: I18N and L10N830427
-Node: Explaining gettext831114
-Ref: Explaining gettext-Footnote-1837006
-Ref: Explaining gettext-Footnote-2837191
-Node: Programmer i18n837356
-Ref: Programmer i18n-Footnote-1842305
-Node: Translator i18n842354
-Node: String Extraction843148
-Ref: String Extraction-Footnote-1844280
-Node: Printf Ordering844366
-Ref: Printf Ordering-Footnote-1847152
-Node: I18N Portability847216
-Ref: I18N Portability-Footnote-1849672
-Node: I18N Example849735
-Ref: I18N Example-Footnote-1852541
-Node: Gawk I18N852614
-Node: I18N Summary853259
-Node: Debugger854600
-Node: Debugging855622
-Node: Debugging Concepts856063
-Node: Debugging Terms857872
-Node: Awk Debugging860447
-Node: Sample Debugging Session861353
-Node: Debugger Invocation861887
-Node: Finding The Bug863273
-Node: List of Debugger Commands869751
-Node: Breakpoint Control871084
-Node: Debugger Execution Control874778
-Node: Viewing And Changing Data878140
-Node: Execution Stack881514
-Node: Debugger Info883151
-Node: Miscellaneous Debugger Commands887222
-Node: Readline Support892310
-Node: Limitations893206
-Ref: Limitations-Footnote-1897437
-Node: Debugging Summary897488
-Node: Arbitrary Precision Arithmetic898767
-Node: Computer Arithmetic900183
-Ref: table-numeric-ranges903774
-Ref: Computer Arithmetic-Footnote-1904496
-Node: Math Definitions904553
-Ref: table-ieee-formats907867
-Ref: Math Definitions-Footnote-1908470
-Node: MPFR features908575
-Node: FP Math Caution910292
-Ref: FP Math Caution-Footnote-1911364
-Node: Inexactness of computations911733
-Node: Inexact representation912693
-Node: Comparing FP Values914053
-Node: Errors accumulate915135
-Node: Getting Accuracy916568
-Node: Try To Round919278
-Node: Setting precision920177
-Ref: table-predefined-precision-strings920874
-Node: Setting the rounding mode922704
-Ref: table-gawk-rounding-modes923078
-Ref: Setting the rounding mode-Footnote-1926486
-Node: Arbitrary Precision Integers926665
-Ref: Arbitrary Precision Integers-Footnote-1931582
-Node: POSIX Floating Point Problems931731
-Ref: POSIX Floating Point Problems-Footnote-1935613
-Node: Floating point summary935651
-Node: Dynamic Extensions937841
-Node: Extension Intro939394
-Node: Plugin License940660
-Node: Extension Mechanism Outline941457
-Ref: figure-load-extension941896
-Ref: figure-register-new-function943461
-Ref: figure-call-new-function944553
-Node: Extension API Description946615
-Node: Extension API Functions Introduction948147
-Node: General Data Types953006
-Ref: General Data Types-Footnote-1958961
-Node: Memory Allocation Functions959260
-Ref: Memory Allocation Functions-Footnote-1962105
-Node: Constructor Functions962204
-Node: Registration Functions963949
-Node: Extension Functions964634
-Node: Exit Callback Functions967257
-Node: Extension Version String968507
-Node: Input Parsers969170
-Node: Output Wrappers979052
-Node: Two-way processors983564
-Node: Printing Messages985829
-Ref: Printing Messages-Footnote-1987000
-Node: Updating ERRNO987153
-Node: Requesting Values987892
-Ref: table-value-types-returned988629
-Node: Accessing Parameters989512
-Node: Symbol Table Access990747
-Node: Symbol table by name991259
-Node: Symbol table by cookie993280
-Ref: Symbol table by cookie-Footnote-1997432
-Node: Cached values997496
-Ref: Cached values-Footnote-11001003
-Node: Array Manipulation1001094
-Ref: Array Manipulation-Footnote-11002185
-Node: Array Data Types1002222
-Ref: Array Data Types-Footnote-11004880
-Node: Array Functions1004972
-Node: Flattening Arrays1008830
-Node: Creating Arrays1015738
-Node: Redirection API1020507
-Node: Extension API Variables1023338
-Node: Extension Versioning1023971
-Ref: gawk-api-version1024408
-Node: Extension API Informational Variables1026164
-Node: Extension API Boilerplate1027228
-Node: Finding Extensions1031042
-Node: Extension Example1031601
-Node: Internal File Description1032399
-Node: Internal File Ops1036479
-Ref: Internal File Ops-Footnote-11048241
-Node: Using Internal File Ops1048381
-Ref: Using Internal File Ops-Footnote-11050764
-Node: Extension Samples1051038
-Node: Extension Sample File Functions1052567
-Node: Extension Sample Fnmatch1060216
-Node: Extension Sample Fork1061703
-Node: Extension Sample Inplace1062921
-Node: Extension Sample Ord1066131
-Node: Extension Sample Readdir1066967
-Ref: table-readdir-file-types1067856
-Node: Extension Sample Revout1068661
-Node: Extension Sample Rev2way1069250
-Node: Extension Sample Read write array1069990
-Node: Extension Sample Readfile1071932
-Node: Extension Sample Time1073027
-Node: Extension Sample API Tests1074375
-Node: gawkextlib1074867
-Node: Extension summary1077314
-Node: Extension Exercises1081016
-Node: Language History1082514
-Node: V7/SVR3.11084170
-Node: SVR41086322
-Node: POSIX1087756
-Node: BTL1089135
-Node: POSIX/GNU1089864
-Node: Feature History1095726
-Node: Common Extensions1110096
-Node: Ranges and Locales1111379
-Ref: Ranges and Locales-Footnote-11115995
-Ref: Ranges and Locales-Footnote-21116022
-Ref: Ranges and Locales-Footnote-31116257
-Node: Contributors1116478
-Node: History summary1122038
-Node: Installation1123418
-Node: Gawk Distribution1124362
-Node: Getting1124846
-Node: Extracting1125807
-Node: Distribution contents1127445
-Node: Unix Installation1133530
-Node: Quick Installation1134212
-Node: Shell Startup Files1136626
-Node: Additional Configuration Options1137704
-Node: Configuration Philosophy1139509
-Node: Non-Unix Installation1141878
-Node: PC Installation1142338
-Node: PC Binary Installation1143176
-Node: PC Compiling1143611
-Node: PC Using1144728
-Node: Cygwin1147773
-Node: MSYS1148543
-Node: VMS Installation1149044
-Node: VMS Compilation1149835
-Ref: VMS Compilation-Footnote-11151064
-Node: VMS Dynamic Extensions1151122
-Node: VMS Installation Details1152807
-Node: VMS Running1155060
-Node: VMS GNV1159339
-Node: VMS Old Gawk1160074
-Node: Bugs1160545
-Node: Bug address1161208
-Node: Usenet1163605
-Node: Maintainers1164380
-Node: Other Versions1165756
-Node: Installation summary1172340
-Node: Notes1173375
-Node: Compatibility Mode1174240
-Node: Additions1175022
-Node: Accessing The Source1175947
-Node: Adding Code1177382
-Node: New Ports1183601
-Node: Derived Files1188089
-Ref: Derived Files-Footnote-11193574
-Ref: Derived Files-Footnote-21193609
-Ref: Derived Files-Footnote-31194207
-Node: Future Extensions1194321
-Node: Implementation Limitations1194979
-Node: Extension Design1196162
-Node: Old Extension Problems1197316
-Ref: Old Extension Problems-Footnote-11198834
-Node: Extension New Mechanism Goals1198891
-Ref: Extension New Mechanism Goals-Footnote-11202255
-Node: Extension Other Design Decisions1202444
-Node: Extension Future Growth1204557
-Node: Old Extension Mechanism1205393
-Node: Notes summary1207156
-Node: Basic Concepts1208338
-Node: Basic High Level1209019
-Ref: figure-general-flow1209301
-Ref: figure-process-flow1209986
-Ref: Basic High Level-Footnote-11213287
-Node: Basic Data Typing1213472
-Node: Glossary1216800
-Node: Copying1248747
-Node: GNU Free Documentation License1286286
-Node: Index1311404
+Ref: Variable Typing-Footnote-1371094
+Ref: Variable Typing-Footnote-2371166
+Node: Comparison Operators371243
+Ref: table-relational-ops371662
+Node: POSIX String Comparison375157
+Ref: POSIX String Comparison-Footnote-1376852
+Ref: POSIX String Comparison-Footnote-2376991
+Node: Boolean Ops377075
+Ref: Boolean Ops-Footnote-1381557
+Node: Conditional Exp381649
+Node: Function Calls383385
+Node: Precedence387262
+Node: Locales390921
+Node: Expressions Summary392553
+Node: Patterns and Actions395126
+Node: Pattern Overview396246
+Node: Regexp Patterns397923
+Node: Expression Patterns398465
+Node: Ranges402246
+Node: BEGIN/END405354
+Node: Using BEGIN/END406115
+Ref: Using BEGIN/END-Footnote-1408851
+Node: I/O And BEGIN/END408957
+Node: BEGINFILE/ENDFILE411271
+Node: Empty414178
+Node: Using Shell Variables414495
+Node: Action Overview416769
+Node: Statements419094
+Node: If Statement420942
+Node: While Statement422437
+Node: Do Statement424465
+Node: For Statement425613
+Node: Switch Statement428771
+Node: Break Statement431157
+Node: Continue Statement433249
+Node: Next Statement435076
+Node: Nextfile Statement437459
+Node: Exit Statement440111
+Node: Built-in Variables442514
+Node: User-modified443647
+Node: Auto-set451233
+Ref: Auto-set-Footnote-1465886
+Ref: Auto-set-Footnote-2466092
+Node: ARGC and ARGV466148
+Node: Pattern Action Summary470361
+Node: Arrays472791
+Node: Array Basics474120
+Node: Array Intro474964
+Ref: figure-array-elements476939
+Ref: Array Intro-Footnote-1479643
+Node: Reference to Elements479771
+Node: Assigning Elements482235
+Node: Array Example482726
+Node: Scanning an Array484485
+Node: Controlling Scanning487507
+Ref: Controlling Scanning-Footnote-1492906
+Node: Numeric Array Subscripts493222
+Node: Uninitialized Subscripts495406
+Node: Delete497025
+Ref: Delete-Footnote-1499777
+Node: Multidimensional499834
+Node: Multiscanning502929
+Node: Arrays of Arrays504520
+Node: Arrays Summary509287
+Node: Functions511380
+Node: Built-in512418
+Node: Calling Built-in513499
+Node: Numeric Functions515495
+Ref: Numeric Functions-Footnote-1520328
+Ref: Numeric Functions-Footnote-2520685
+Ref: Numeric Functions-Footnote-3520733
+Node: String Functions521005
+Ref: String Functions-Footnote-1544509
+Ref: String Functions-Footnote-2544637
+Ref: String Functions-Footnote-3544885
+Node: Gory Details544972
+Ref: table-sub-escapes546763
+Ref: table-sub-proposed548282
+Ref: table-posix-sub549645
+Ref: table-gensub-escapes551186
+Ref: Gory Details-Footnote-1552009
+Node: I/O Functions552163
+Ref: table-system-return-values558745
+Ref: I/O Functions-Footnote-1560725
+Ref: I/O Functions-Footnote-2560873
+Node: Time Functions560993
+Ref: Time Functions-Footnote-1571515
+Ref: Time Functions-Footnote-2571583
+Ref: Time Functions-Footnote-3571741
+Ref: Time Functions-Footnote-4571852
+Ref: Time Functions-Footnote-5571964
+Ref: Time Functions-Footnote-6572191
+Node: Bitwise Functions572457
+Ref: table-bitwise-ops573051
+Ref: Bitwise Functions-Footnote-1579077
+Ref: Bitwise Functions-Footnote-2579250
+Node: Type Functions579441
+Node: I18N Functions581988
+Node: User-defined583639
+Node: Definition Syntax584444
+Ref: Definition Syntax-Footnote-1590131
+Node: Function Example590202
+Ref: Function Example-Footnote-1593124
+Node: Function Caveats593146
+Node: Calling A Function593664
+Node: Variable Scope594622
+Node: Pass By Value/Reference597616
+Node: Return Statement601115
+Node: Dynamic Typing604094
+Node: Indirect Calls605024
+Ref: Indirect Calls-Footnote-1615275
+Node: Functions Summary615403
+Node: Library Functions618108
+Ref: Library Functions-Footnote-1621715
+Ref: Library Functions-Footnote-2621858
+Node: Library Names622029
+Ref: Library Names-Footnote-1625489
+Ref: Library Names-Footnote-2625712
+Node: General Functions625798
+Node: Strtonum Function626901
+Node: Assert Function629923
+Node: Round Function633249
+Node: Cliff Random Function634790
+Node: Ordinal Functions635806
+Ref: Ordinal Functions-Footnote-1638869
+Ref: Ordinal Functions-Footnote-2639121
+Node: Join Function639331
+Ref: Join Function-Footnote-1641101
+Node: Getlocaltime Function641301
+Node: Readfile Function645043
+Node: Shell Quoting647015
+Node: Data File Management648416
+Node: Filetrans Function649048
+Node: Rewind Function653144
+Node: File Checking655050
+Ref: File Checking-Footnote-1656384
+Node: Empty Files656585
+Node: Ignoring Assigns658564
+Node: Getopt Function660114
+Ref: Getopt Function-Footnote-1671583
+Node: Passwd Functions671783
+Ref: Passwd Functions-Footnote-1680622
+Node: Group Functions680710
+Ref: Group Functions-Footnote-1688608
+Node: Walking Arrays688815
+Node: Library Functions Summary691823
+Node: Library Exercises693229
+Node: Sample Programs693694
+Node: Running Examples694464
+Node: Clones695192
+Node: Cut Program696416
+Node: Egrep Program706345
+Ref: Egrep Program-Footnote-1713857
+Node: Id Program713967
+Node: Split Program717647
+Ref: Split Program-Footnote-1721106
+Node: Tee Program721235
+Node: Uniq Program724025
+Node: Wc Program731451
+Ref: Wc Program-Footnote-1735706
+Node: Miscellaneous Programs735800
+Node: Dupword Program737013
+Node: Alarm Program739043
+Node: Translate Program743898
+Ref: Translate Program-Footnote-1748463
+Node: Labels Program748733
+Ref: Labels Program-Footnote-1752084
+Node: Word Sorting752168
+Node: History Sorting756240
+Node: Extract Program758075
+Node: Simple Sed765604
+Node: Igawk Program768678
+Ref: Igawk Program-Footnote-1783009
+Ref: Igawk Program-Footnote-2783211
+Ref: Igawk Program-Footnote-3783333
+Node: Anagram Program783448
+Node: Signature Program786510
+Node: Programs Summary787757
+Node: Programs Exercises788971
+Ref: Programs Exercises-Footnote-1793100
+Node: Advanced Features793191
+Node: Nondecimal Data795181
+Node: Array Sorting796772
+Node: Controlling Array Traversal797472
+Ref: Controlling Array Traversal-Footnote-1805839
+Node: Array Sorting Functions805957
+Ref: Array Sorting Functions-Footnote-1811048
+Node: Two-way I/O811244
+Ref: Two-way I/O-Footnote-1817794
+Ref: Two-way I/O-Footnote-2817981
+Node: TCP/IP Networking818063
+Node: Profiling821181
+Ref: Profiling-Footnote-1829674
+Node: Advanced Features Summary829997
+Node: Internationalization831841
+Node: I18N and L10N833321
+Node: Explaining gettext834008
+Ref: Explaining gettext-Footnote-1839900
+Ref: Explaining gettext-Footnote-2840085
+Node: Programmer i18n840250
+Ref: Programmer i18n-Footnote-1845199
+Node: Translator i18n845248
+Node: String Extraction846042
+Ref: String Extraction-Footnote-1847174
+Node: Printf Ordering847260
+Ref: Printf Ordering-Footnote-1850046
+Node: I18N Portability850110
+Ref: I18N Portability-Footnote-1852566
+Node: I18N Example852629
+Ref: I18N Example-Footnote-1855435
+Node: Gawk I18N855508
+Node: I18N Summary856153
+Node: Debugger857494
+Node: Debugging858516
+Node: Debugging Concepts858957
+Node: Debugging Terms860766
+Node: Awk Debugging863341
+Node: Sample Debugging Session864247
+Node: Debugger Invocation864781
+Node: Finding The Bug866167
+Node: List of Debugger Commands872645
+Node: Breakpoint Control873978
+Node: Debugger Execution Control877672
+Node: Viewing And Changing Data881034
+Node: Execution Stack884408
+Node: Debugger Info886045
+Node: Miscellaneous Debugger Commands890116
+Node: Readline Support895204
+Node: Limitations896100
+Node: Debugging Summary898209
+Node: Arbitrary Precision Arithmetic899488
+Node: Computer Arithmetic900904
+Ref: table-numeric-ranges904495
+Ref: Computer Arithmetic-Footnote-1905217
+Node: Math Definitions905274
+Ref: table-ieee-formats908588
+Ref: Math Definitions-Footnote-1909191
+Node: MPFR features909296
+Node: FP Math Caution911013
+Ref: FP Math Caution-Footnote-1912085
+Node: Inexactness of computations912454
+Node: Inexact representation913414
+Node: Comparing FP Values914774
+Node: Errors accumulate915856
+Node: Getting Accuracy917289
+Node: Try To Round919999
+Node: Setting precision920898
+Ref: table-predefined-precision-strings921595
+Node: Setting the rounding mode923425
+Ref: table-gawk-rounding-modes923799
+Ref: Setting the rounding mode-Footnote-1927207
+Node: Arbitrary Precision Integers927386
+Ref: Arbitrary Precision Integers-Footnote-1932303
+Node: POSIX Floating Point Problems932452
+Ref: POSIX Floating Point Problems-Footnote-1936334
+Node: Floating point summary936372
+Node: Dynamic Extensions938562
+Node: Extension Intro940115
+Node: Plugin License941381
+Node: Extension Mechanism Outline942178
+Ref: figure-load-extension942617
+Ref: figure-register-new-function944182
+Ref: figure-call-new-function945274
+Node: Extension API Description947336
+Node: Extension API Functions Introduction948868
+Node: General Data Types954179
+Ref: General Data Types-Footnote-1960134
+Node: Memory Allocation Functions960433
+Ref: Memory Allocation Functions-Footnote-1963278
+Node: Constructor Functions963377
+Node: Registration Functions965122
+Node: Extension Functions965807
+Node: Exit Callback Functions968430
+Node: Extension Version String969680
+Node: Input Parsers970343
+Node: Output Wrappers980225
+Node: Two-way processors984737
+Node: Printing Messages987002
+Ref: Printing Messages-Footnote-1988173
+Node: Updating ERRNO988326
+Node: Requesting Values989065
+Ref: table-value-types-returned989802
+Node: Accessing Parameters990685
+Node: Symbol Table Access991920
+Node: Symbol table by name992432
+Node: Symbol table by cookie994453
+Ref: Symbol table by cookie-Footnote-1998605
+Node: Cached values998669
+Ref: Cached values-Footnote-11002176
+Node: Array Manipulation1002267
+Ref: Array Manipulation-Footnote-11003358
+Node: Array Data Types1003395
+Ref: Array Data Types-Footnote-11006053
+Node: Array Functions1006145
+Node: Flattening Arrays1010003
+Node: Creating Arrays1016911
+Node: Redirection API1021680
+Node: Extension API Variables1024511
+Node: Extension Versioning1025144
+Ref: gawk-api-version1025581
+Node: Extension API Informational Variables1027337
+Node: Extension API Boilerplate1028401
+Node: Finding Extensions1032215
+Node: Extension Example1032774
+Node: Internal File Description1033572
+Node: Internal File Ops1037652
+Ref: Internal File Ops-Footnote-11049414
+Node: Using Internal File Ops1049554
+Ref: Using Internal File Ops-Footnote-11051937
+Node: Extension Samples1052211
+Node: Extension Sample File Functions1053740
+Node: Extension Sample Fnmatch1061389
+Node: Extension Sample Fork1062876
+Node: Extension Sample Inplace1064094
+Node: Extension Sample Ord1067304
+Node: Extension Sample Readdir1068140
+Ref: table-readdir-file-types1069029
+Node: Extension Sample Revout1069834
+Node: Extension Sample Rev2way1070423
+Node: Extension Sample Read write array1071163
+Node: Extension Sample Readfile1073105
+Node: Extension Sample Time1074200
+Node: Extension Sample API Tests1075548
+Node: gawkextlib1076040
+Node: Extension summary1078487
+Node: Extension Exercises1082189
+Node: Language History1083687
+Node: V7/SVR3.11085343
+Node: SVR41087495
+Node: POSIX1088929
+Node: BTL1090308
+Node: POSIX/GNU1091037
+Node: Feature History1096899
+Node: Common Extensions1111269
+Node: Ranges and Locales1112552
+Ref: Ranges and Locales-Footnote-11117168
+Ref: Ranges and Locales-Footnote-21117195
+Ref: Ranges and Locales-Footnote-31117430
+Node: Contributors1117651
+Node: History summary1123211
+Node: Installation1124591
+Node: Gawk Distribution1125535
+Node: Getting1126019
+Node: Extracting1126980
+Node: Distribution contents1128618
+Node: Unix Installation1134703
+Node: Quick Installation1135385
+Node: Shell Startup Files1137799
+Node: Additional Configuration Options1138877
+Node: Configuration Philosophy1140682
+Node: Non-Unix Installation1143051
+Node: PC Installation1143511
+Node: PC Binary Installation1144349
+Node: PC Compiling1144784
+Node: PC Using1145901
+Node: Cygwin1148946
+Node: MSYS1149716
+Node: VMS Installation1150217
+Node: VMS Compilation1151008
+Ref: VMS Compilation-Footnote-11152237
+Node: VMS Dynamic Extensions1152295
+Node: VMS Installation Details1153980
+Node: VMS Running1156233
+Node: VMS GNV1160512
+Node: VMS Old Gawk1161247
+Node: Bugs1161718
+Node: Bug address1162381
+Node: Usenet1164778
+Node: Maintainers1165553
+Node: Other Versions1166929
+Node: Installation summary1173513
+Node: Notes1174548
+Node: Compatibility Mode1175413
+Node: Additions1176195
+Node: Accessing The Source1177120
+Node: Adding Code1178555
+Node: New Ports1184774
+Node: Derived Files1189262
+Ref: Derived Files-Footnote-11194747
+Ref: Derived Files-Footnote-21194782
+Ref: Derived Files-Footnote-31195380
+Node: Future Extensions1195494
+Node: Implementation Limitations1196152
+Node: Extension Design1197335
+Node: Old Extension Problems1198489
+Ref: Old Extension Problems-Footnote-11200007
+Node: Extension New Mechanism Goals1200064
+Ref: Extension New Mechanism Goals-Footnote-11203428
+Node: Extension Other Design Decisions1203617
+Node: Extension Future Growth1205730
+Node: Old Extension Mechanism1206566
+Node: Notes summary1208329
+Node: Basic Concepts1209511
+Node: Basic High Level1210192
+Ref: figure-general-flow1210474
+Ref: figure-process-flow1211159
+Ref: Basic High Level-Footnote-11214460
+Node: Basic Data Typing1214645
+Node: Glossary1217973
+Node: Copying1249920
+Node: GNU Free Documentation License1287459
+Node: Index1312577

End Tag Table
diff --git a/doc/gawk.texi b/doc/gawk.texi
index bdc64777..2be7d8f1 100644
--- a/doc/gawk.texi
+++ b/doc/gawk.texi
@@ -12208,17 +12208,93 @@ compares variables.
@node Variable Typing
@subsubsection String Type versus Numeric Type
+Scalar objects in @command{awk} (variables, array elements, and fields)
+are @emph{dynamically} typed. This means their type can change as the
+program runs, from @dfn{untyped} before any use,@footnote{@command{gawk}
+calls this @dfn{unassigned}, as the following example shows.} to string
+or number, and then from string to number or number to string, as the
+program progresses.
+
+You can't do much with untyped variables, other than tell that they
+are untyped. The following program tests @code{a} against @code{""}
+and @code{0}; the test succeeds when @code{a} has never been assigned
+a value. It also uses the built-in @code{typeof()} function
+(not presented yet; @pxref{Type Functions}) to show @code{a}'s type:
+
+@example
+$ @kbd{gawk 'BEGIN @{ print (a == "" && a == 0 ?}
+> @kbd{"a is untyped" : "a has a type!") ; print typeof(a) @}'}
+@print{} a is untyped
+@print{} unassigned
+@end example
+
+A scalar has numeric type when assigned a numeric value,
+such as from a numeric constant, or from another scalar
+with numeric type:
+
+@example
+$ @kbd{gawk 'BEGIN @{ a = 42 ; print typeof(a)}
+> @kbd{b = a ; print typeof(b) @}'}
+@print{} number
+@print{} number
+@end example
+
+Similarly, a scalar has string type when assigned a string
+value, such as from a string constant, or from another scalar
+with string type:
+
+@example
+$ @kbd{gawk 'BEGIN @{ a = "forty two" ; print typeof(a)}
+> @kbd{b = a ; print typeof(b) @}'}
+@print{} string
+@print{} string
+@end example
+
+So far, this is all simple and straightforward. What happens, though,
+when @command{awk} has to process data from a user? Let's start with
+field data. What should the following command produce as output?
+
+@example
+echo hello | awk '@{ printf("%s %s < 42\n", $1,
+ ($1 < 42 ? "is" : "is not")) @}'
+@end example
+
+@noindent
+Since @samp{hello} is alphabetic data, @command{awk} can only do a string
+comparison. Internally, it converts @code{42} into @code{"42"} and compares
+the two string values @code{"hello"} and @code{"42"}. Here's the result:
+
+@example
+$ @kbd{echo hello | awk '@{ printf("%s %s < 42\n", $1,}
+> @kbd{ ($1 < 42 ? "is" : "is not")) @}'}
+@print{} hello is not < 42
+@end example
+
+However, what happens when data from a user @emph{looks like} a number?
+On the one hand, in reality, the input data consists of characters, not
+binary numeric
+values. But, on the other hand, the data looks numeric, and @command{awk}
+really ought to treat it as such. And indeed, it does:
+
+@example
+$ @kbd{echo 37 | awk '@{ printf("%s %s < 42\n", $1,}
+> @kbd{ ($1 < 42 ? "is" : "is not")) @}'}
+@print{} 37 is < 42
+@end example
+
+Here are the rules for when @command{awk}
+treats data as a number, and for when it treats data as a string.
+
@cindex numeric, strings
@cindex strings, numeric
@cindex POSIX @command{awk}, numeric strings and
-The POSIX standard introduced
-the concept of a @dfn{numeric string}, which is simply a string that looks
-like a number---for example, @code{@w{" +2"}}. This concept is used
-for determining the type of a variable.
-The type of the variable is important because the types of two variables
-determine how they are compared.
-Variable typing follows these rules:
+The POSIX standard uses the term @dfn{numeric string} for input data that
+looks numeric. The @samp{37} in the previous example is a numeric string.
+So what is the type of a numeric string? Answer: numeric.
+The type of a variable is important because the types of two variables
+determine how they are compared.
+Variable typing follows these definitions and rules:
@itemize @value{BULLET}
@item
@@ -12233,7 +12309,9 @@ attribute.
Fields, @code{getline} input, @code{FILENAME}, @code{ARGV} elements,
@code{ENVIRON} elements, and the elements of an array created by
@code{match()}, @code{split()}, and @code{patsplit()} that are numeric
-strings have the @dfn{strnum} attribute. Otherwise, they have
+strings have the @dfn{strnum} attribute.@footnote{Thus, a POSIX
+numeric string and @command{gawk}'s strnum are the same thing.}
+Otherwise, they have
the @dfn{string} attribute. Uninitialized variables also have the
@dfn{strnum} attribute.
@@ -12307,7 +12385,7 @@ STRNUM &&string &numeric &numeric\cr
@end tex
@ifnottex
@ifnotdocbook
-@display
+@verbatim
+----------------------------------------------
| STRING NUMERIC STRNUM
--------+----------------------------------------------
@@ -12318,7 +12396,7 @@ NUMERIC | string numeric numeric
|
STRNUM | string numeric numeric
--------+----------------------------------------------
-@end display
+@end verbatim
@end ifnotdocbook
@end ifnottex
@docbook
@@ -12377,10 +12455,14 @@ purposes.
In short, when one operand is a ``pure'' string, such as a string
constant, then a string comparison is performed. Otherwise, a
numeric comparison is performed.
+(The primary difference between a number and a strnum is that
+for strnums @command{gawk} preserves the original string value that
+the scalar had when it was read as input.)
+
+This point bears additional emphasis:
+Input that looks numeric @emph{is} numeric.
+All other input is treated as strings.
-This point bears additional emphasis: All user input is made of characters,
-and so is first and foremost of string type; input strings
-that look numeric are additionally given the strnum attribute.
Thus, the six-character input string @w{@samp{ +3.14}} receives the
strnum attribute. In contrast, the eight characters
@w{@code{" +3.14"}} appearing in program text comprise a string constant.
@@ -12407,6 +12489,14 @@ $ @kbd{echo ' +3.14' | awk '@{ print($1 == 3.14) @}'} @ii{True}
@print{} 1
@end example
+You can see the type of an input field (or other user input)
+using @code{typeof()}:
+
+@example
+$ @kbd{echo hello 37 | gawk '@{ print typeof($1), typeof($2) @}'}
+@print{} string strnum
+@end example
+
@node Comparison Operators
@subsubsection Comparison Operators
@@ -19644,8 +19734,8 @@ Return one of the following strings, depending upon the type of @var{x}:
@var{x} is a string.
@item "strnum"
-@var{x} is a string that might be a number, such as a field or
-the result of calling @code{split()}. (I.e., @var{x} has the STRNUM
+@var{x} is a number that started life as user input, such as a field or
+the result of calling @code{split()}. (I.e., @var{x} has the strnum
attribute; @pxref{Variable Typing}.)
@item "unassigned"
@@ -19654,8 +19744,9 @@ For example:
@example
BEGIN @{
- a[1] # creates a[1] but it has no assigned value
- print typeof(a[1]) # scalar_u
+ # creates a[1] but it has no assigned value
+ a[1]
+ print typeof(a[1]) # unassigned
@}
@end example
@@ -30707,6 +30798,8 @@ executing, short programs.
The @command{gawk} debugger only accepts source code supplied with the @option{-f} option.
@end itemize
+@ignore
+@c 11/2016: This no longer applies after all the type cleanup work that's been done.
One other point is worth discussing. Conventional debuggers run in a
separate process (and thus address space) from the programs that they
debug (the @dfn{debuggee}, if you will).
@@ -30765,6 +30858,7 @@ is indeed a number, and this is reflected in the result of
Cases like this where the debugger is not transparent to the program's
execution should be rare. If you encounter one, please report it
(@pxref{Bugs}).
+@end ignore
@ignore
Look forward to a future release when these and other missing features may
@@ -32271,14 +32365,26 @@ and is managed by @command{gawk} from then on.
The API defines several simple @code{struct}s that map values as seen
from @command{awk}. A value can be a @code{double}, a string, or an
array (as in multidimensional arrays, or when creating a new array).
+
String values maintain both pointer and length, because embedded @sc{nul}
characters are allowed.
@quotation NOTE
-By intent, strings are maintained using the current multibyte encoding (as
-defined by @env{LC_@var{xxx}} environment variables) and not using wide
-characters. This matches how @command{gawk} stores strings internally
-and also how characters are likely to be input into and output from files.
+By intent, @command{gawk} maintains strings using the current multibyte
+encoding (as defined by @env{LC_@var{xxx}} environment variables)
+and not using wide characters. This matches how @command{gawk} stores
+strings internally and also how characters are likely to be input into
+and output from files.
+@end quotation
+
+@quotation NOTE
+String values passed to an extension by @command{gawk} are always
+@sc{nul}-terminated. Thus it is safe to pass such string values to
+standard library and system routines. However, because
+@command{gawk} allows embedded @sc{nul} characters in string data,
+you should check that @samp{strlen(@var{some_string})} matches
+the length for that string passed to the extension before using
+it as a regular C string.
@end quotation
@item
diff --git a/doc/gawktexi.in b/doc/gawktexi.in
index efca7b6e..76c3a9b2 100644
--- a/doc/gawktexi.in
+++ b/doc/gawktexi.in
@@ -11527,17 +11527,93 @@ compares variables.
@node Variable Typing
@subsubsection String Type versus Numeric Type
+Scalar objects in @command{awk} (variables, array elements, and fields)
+are @emph{dynamically} typed. This means their type can change as the
+program runs, from @dfn{untyped} before any use,@footnote{@command{gawk}
+calls this @dfn{unassigned}, as the following example shows.} to string
+or number, and then from string to number or number to string, as the
+program progresses.
+
+You can't do much with untyped variables, other than tell that they
+are untyped. The following program tests @code{a} against @code{""}
+and @code{0}; the test succeeds when @code{a} has never been assigned
+a value. It also uses the built-in @code{typeof()} function
+(not presented yet; @pxref{Type Functions}) to show @code{a}'s type:
+
+@example
+$ @kbd{gawk 'BEGIN @{ print (a == "" && a == 0 ?}
+> @kbd{"a is untyped" : "a has a type!") ; print typeof(a) @}'}
+@print{} a is untyped
+@print{} unassigned
+@end example
+
+A scalar has numeric type when assigned a numeric value,
+such as from a numeric constant, or from another scalar
+with numeric type:
+
+@example
+$ @kbd{gawk 'BEGIN @{ a = 42 ; print typeof(a)}
+> @kbd{b = a ; print typeof(b) @}'}
+@print{} number
+@print{} number
+@end example
+
+Similarly, a scalar has string type when assigned a string
+value, such as from a string constant, or from another scalar
+with string type:
+
+@example
+$ @kbd{gawk 'BEGIN @{ a = "forty two" ; print typeof(a)}
+> @kbd{b = a ; print typeof(b) @}'}
+@print{} string
+@print{} string
+@end example
+
+So far, this is all simple and straightforward. What happens, though,
+when @command{awk} has to process data from a user? Let's start with
+field data. What should the following command produce as output?
+
+@example
+echo hello | awk '@{ printf("%s %s < 42\n", $1,
+ ($1 < 42 ? "is" : "is not")) @}'
+@end example
+
+@noindent
+Since @samp{hello} is alphabetic data, @command{awk} can only do a string
+comparison. Internally, it converts @code{42} into @code{"42"} and compares
+the two string values @code{"hello"} and @code{"42"}. Here's the result:
+
+@example
+$ @kbd{echo hello | awk '@{ printf("%s %s < 42\n", $1,}
+> @kbd{ ($1 < 42 ? "is" : "is not")) @}'}
+@print{} hello is not < 42
+@end example
+
+However, what happens when data from a user @emph{looks like} a number?
+On the one hand, in reality, the input data consists of characters, not
+binary numeric
+values. But, on the other hand, the data looks numeric, and @command{awk}
+really ought to treat it as such. And indeed, it does:
+
+@example
+$ @kbd{echo 37 | awk '@{ printf("%s %s < 42\n", $1,}
+> @kbd{ ($1 < 42 ? "is" : "is not")) @}'}
+@print{} 37 is < 42
+@end example
+
+Here are the rules for when @command{awk}
+treats data as a number, and for when it treats data as a string.
+
@cindex numeric, strings
@cindex strings, numeric
@cindex POSIX @command{awk}, numeric strings and
-The POSIX standard introduced
-the concept of a @dfn{numeric string}, which is simply a string that looks
-like a number---for example, @code{@w{" +2"}}. This concept is used
-for determining the type of a variable.
-The type of the variable is important because the types of two variables
-determine how they are compared.
-Variable typing follows these rules:
+The POSIX standard uses the term @dfn{numeric string} for input data that
+looks numeric. The @samp{37} in the previous example is a numeric string.
+So what is the type of a numeric string? Answer: numeric.
+The type of a variable is important because the types of two variables
+determine how they are compared.
+Variable typing follows these definitions and rules:
@itemize @value{BULLET}
@item
@@ -11552,7 +11628,9 @@ attribute.
Fields, @code{getline} input, @code{FILENAME}, @code{ARGV} elements,
@code{ENVIRON} elements, and the elements of an array created by
@code{match()}, @code{split()}, and @code{patsplit()} that are numeric
-strings have the @dfn{strnum} attribute. Otherwise, they have
+strings have the @dfn{strnum} attribute.@footnote{Thus, a POSIX
+numeric string and @command{gawk}'s strnum are the same thing.}
+Otherwise, they have
the @dfn{string} attribute. Uninitialized variables also have the
@dfn{strnum} attribute.
@@ -11626,7 +11704,7 @@ STRNUM &&string &numeric &numeric\cr
@end tex
@ifnottex
@ifnotdocbook
-@display
+@verbatim
+----------------------------------------------
| STRING NUMERIC STRNUM
--------+----------------------------------------------
@@ -11637,7 +11715,7 @@ NUMERIC | string numeric numeric
|
STRNUM | string numeric numeric
--------+----------------------------------------------
-@end display
+@end verbatim
@end ifnotdocbook
@end ifnottex
@docbook
@@ -11696,10 +11774,14 @@ purposes.
In short, when one operand is a ``pure'' string, such as a string
constant, then a string comparison is performed. Otherwise, a
numeric comparison is performed.
+(The primary difference between a number and a strnum is that
+for strnums @command{gawk} preserves the original string value that
+the scalar had when it came in.)
+
+This point bears additional emphasis:
+Input that looks numeric @emph{is} numeric.
+All other input is treated as strings.
-This point bears additional emphasis: All user input is made of characters,
-and so is first and foremost of string type; input strings
-that look numeric are additionally given the strnum attribute.
Thus, the six-character input string @w{@samp{ +3.14}} receives the
strnum attribute. In contrast, the eight characters
@w{@code{" +3.14"}} appearing in program text comprise a string constant.
@@ -11726,6 +11808,14 @@ $ @kbd{echo ' +3.14' | awk '@{ print($1 == 3.14) @}'} @ii{True}
@print{} 1
@end example
+You can see the type of an input field (or other user input)
+using @code{typeof()}:
+
+@example
+$ @kbd{echo hello 37 | gawk '@{ print typeof($1), typeof($2) @}'}
+@print{} string strnum
+@end example
+
@node Comparison Operators
@subsubsection Comparison Operators
@@ -18688,8 +18778,8 @@ Return one of the following strings, depending upon the type of @var{x}:
@var{x} is a string.
@item "strnum"
-@var{x} is a string that might be a number, such as a field or
-the result of calling @code{split()}. (I.e., @var{x} has the STRNUM
+@var{x} is a number that started life as user input, such as a field or
+the result of calling @code{split()}. (I.e., @var{x} has the strnum
attribute; @pxref{Variable Typing}.)
@item "unassigned"
@@ -18698,8 +18788,9 @@ For example:
@example
BEGIN @{
- a[1] # creates a[1] but it has no assigned value
- print typeof(a[1]) # scalar_u
+ # creates a[1] but it has no assigned value
+ a[1]
+ print typeof(a[1]) # unassigned
@}
@end example
@@ -29721,6 +29812,8 @@ executing, short programs.
The @command{gawk} debugger only accepts source code supplied with the @option{-f} option.
@end itemize
+@ignore
+@c 11/2016: This no longer applies after all the type cleanup work that's been done.
One other point is worth discussing. Conventional debuggers run in a
separate process (and thus address space) from the programs that they
debug (the @dfn{debuggee}, if you will).
@@ -29779,6 +29872,7 @@ is indeed a number, and this is reflected in the result of
Cases like this where the debugger is not transparent to the program's
execution should be rare. If you encounter one, please report it
(@pxref{Bugs}).
+@end ignore
@ignore
Look forward to a future release when these and other missing features may
@@ -31285,14 +31379,26 @@ and is managed by @command{gawk} from then on.
The API defines several simple @code{struct}s that map values as seen
from @command{awk}. A value can be a @code{double}, a string, or an
array (as in multidimensional arrays, or when creating a new array).
+
String values maintain both pointer and length, because embedded @sc{nul}
characters are allowed.
@quotation NOTE
-By intent, strings are maintained using the current multibyte encoding (as
-defined by @env{LC_@var{xxx}} environment variables) and not using wide
-characters. This matches how @command{gawk} stores strings internally
-and also how characters are likely to be input into and output from files.
+By intent, @command{gawk} maintains strings using the current multibyte
+encoding (as defined by @env{LC_@var{xxx}} environment variables)
+and not using wide characters. This matches how @command{gawk} stores
+strings internally and also how characters are likely to be input into
+and output from files.
+@end quotation
+
+@quotation NOTE
+String values passed to an extension by @command{gawk} are always
+@sc{nul}-terminated. Thus it is safe to pass such string values to
+standard library and system routines. However, because
+@command{gawk} allows embedded @sc{nul} characters in string data,
+you should check that @samp{strlen(@var{some_string})} matches
+the length for that string passed to the extension before using
+it as a regular C string.
@end quotation
@item
diff --git a/eval.c b/eval.c
index 8c45ea25..2d07a804 100644
--- a/eval.c
+++ b/eval.c
@@ -606,10 +606,16 @@ cmp_nodes(NODE *t1, NODE *t2, bool use_strcmp)
if (IGNORECASE) {
const unsigned char *cp1 = (const unsigned char *) t1->stptr;
const unsigned char *cp2 = (const unsigned char *) t2->stptr;
+ char save1 = t1->stptr[t1->stlen];
+ char save2 = t2->stptr[t2->stlen];
+
if (gawk_mb_cur_max > 1) {
+ t1->stptr[t1->stlen] = t2->stptr[t2->stlen] = '\0';
ret = strncasecmpmbs((const unsigned char *) cp1,
(const unsigned char *) cp2, l);
+ t1->stptr[t1->stlen] = save1;
+ t2->stptr[t2->stlen] = save2;
} else {
/* Could use tolower() here; see discussion above. */
for (ret = 0; l-- > 0 && ret == 0; cp1++, cp2++)
@@ -853,6 +859,8 @@ fmt_ok(NODE *n)
static const char flags[] = " +-#";
#endif
+ // We rely on the caller to zero-terminate n->stptr.
+
if (*p++ != '%')
return 0;
while (*p && strchr(flags, *p) != NULL) /* flags */
@@ -880,15 +888,21 @@ fmt_index(NODE *n)
int ix = 0;
static int fmt_num = 4;
static int fmt_hiwater = 0;
+ char save;
if (fmt_list == NULL)
emalloc(fmt_list, NODE **, fmt_num*sizeof(*fmt_list), "fmt_index");
n = force_string(n);
+
+ save = n->stptr[n->stlen];
+ n->stptr[n->stlen] = '\0';
+
while (ix < fmt_hiwater) {
if (cmp_nodes(fmt_list[ix], n, true) == 0)
return ix;
ix++;
}
+
/* not found */
if (do_lint && ! fmt_ok(n))
lintwarn(_("bad `%sFMT' specification `%s'"),
@@ -896,6 +910,8 @@ fmt_index(NODE *n)
: n == OFMT_node->var_value ? "O"
: "", n->stptr);
+ n->stptr[n->stlen] = save;
+
if (fmt_hiwater >= fmt_num) {
fmt_num *= 2;
erealloc(fmt_list, NODE **, fmt_num * sizeof(*fmt_list), "fmt_index");
diff --git a/interpret.h b/interpret.h
index 56d2e060..816a6efa 100644
--- a/interpret.h
+++ b/interpret.h
@@ -732,7 +732,7 @@ mod:
erealloc(t1->wstptr, wchar_t *,
sizeof(wchar_t) * (wlen + 1), "r_interpret");
- memcpy(t1->wstptr + t1->wstlen, t2->wstptr, t2->wstlen);
+ memcpy(t1->wstptr + t1->wstlen, t2->wstptr, t2->wstlen * sizeof(wchar_t));
t1->wstlen = wlen;
t1->wstptr[wlen] = L'\0';
t1->flags |= WSTRCUR;
diff --git a/io.c b/io.c
index 4e2c6cf2..688723fd 100644
--- a/io.c
+++ b/io.c
@@ -957,7 +957,7 @@ redirect_string(const char *str, size_t explen, bool not_string,
#endif
direction = "to/from";
if (! two_way_open(str, rp, extfd)) {
- if (! failure_fatal || is_non_fatal_redirect(str)) {
+ if (! failure_fatal || is_non_fatal_redirect(str, explen)) {
*errflg = errno;
/* do not free rp, saving it for reuse (save_rp = rp) */
return NULL;
@@ -1044,7 +1044,7 @@ redirect_string(const char *str, size_t explen, bool not_string,
*/
if (errflg != NULL)
*errflg = errno;
- if (failure_fatal && ! is_non_fatal_redirect(str) &&
+ if (failure_fatal && ! is_non_fatal_redirect(str, explen) &&
(redirtype == redirect_output
|| redirtype == redirect_append)) {
/* multiple messages make life easier for translators */
@@ -1125,10 +1125,21 @@ is_non_fatal_std(FILE *fp)
/* is_non_fatal_redirect --- return true if redirected I/O should be nonfatal */
bool
-is_non_fatal_redirect(const char *str)
+is_non_fatal_redirect(const char *str, size_t len)
{
- return in_PROCINFO(nonfatal, NULL, NULL) != NULL
- || in_PROCINFO(str, nonfatal, NULL) != NULL;
+ bool ret;
+ char save;
+ char *s = (char *) str;	/* drops const so s[len] can be overwritten; NOTE(review): confirm every caller passes a writable buffer */
+
+ save = s[len];	/* remember the byte just past the len-byte name */
+ s[len] = '\0';	/* temporarily NUL-terminate the name for the lookups below */
+
+ ret = in_PROCINFO(nonfatal, NULL, NULL) != NULL
+ || in_PROCINFO(s, nonfatal, NULL) != NULL;
+
+ s[len] = save;	/* put the original byte back before returning */
+
+ return ret;
}
/* close_one --- temporarily close an open file to re-use the fd */
@@ -1182,7 +1193,11 @@ do_close(int nargs)
if (nargs == 2) {
/* 2nd arg if present: "to" or "from" for two-way pipe */
/* DO NOT use _() on the strings here! */
+ char save;
+
tmp2 = POP_STRING();
+ save = tmp2->stptr[tmp2->stlen];
+ tmp2->stptr[tmp2->stlen] = '\0';
if (strcasecmp(tmp2->stptr, "to") == 0)
how = CLOSE_TO;
else if (strcasecmp(tmp2->stptr, "from") == 0)
@@ -1191,6 +1206,7 @@ do_close(int nargs)
DEREF(tmp2);
fatal(_("close: second argument must be `to' or `from'"));
}
+ tmp2->stptr[tmp2->stlen] = save;
DEREF(tmp2);
}
@@ -1733,7 +1749,7 @@ devopen(const char *name, const char *mode)
unsigned long retries = 0;
static long msleep = 1000;
bool hard_error = false;
- bool non_fatal = is_non_fatal_redirect(name);
+ bool non_fatal = is_non_fatal_redirect(name, strlen(name));
cp = (char *) name;
@@ -2619,7 +2635,7 @@ do_getline_redir(int into_variable, enum redirval redirtype)
}
return make_number((AWKNUM) -1.0);
} else if ((rp->flag & RED_TWOWAY) != 0 && rp->iop == NULL) {
- if (is_non_fatal_redirect(redir_exp->stptr)) {
+ if (is_non_fatal_redirect(redir_exp->stptr, redir_exp->stlen)) {
update_ERRNO_int(EBADF);
return make_number((AWKNUM) -1.0);
}
diff --git a/mpfr.c b/mpfr.c
index ddf020dd..c0f1ff0c 100644
--- a/mpfr.c
+++ b/mpfr.c
@@ -504,11 +504,11 @@ set_PREC()
mpfr_exp_t emax;
mpfr_exp_t emin;
} ieee_fmts[] = {
-{ "half", 11, 16, -23 }, /* binary16 */
-{ "single", 24, 128, -148 }, /* binary32 */
-{ "double", 53, 1024, -1073 }, /* binary64 */
-{ "quad", 113, 16384, -16493 }, /* binary128 */
-{ "oct", 237, 262144, -262377 }, /* binary256, not in the IEEE 754-2008 standard */
+ { "half", 11, 16, -23 }, /* binary16 */
+ { "single", 24, 128, -148 }, /* binary32 */
+ { "double", 53, 1024, -1073 }, /* binary64 */
+ { "quad", 113, 16384, -16493 }, /* binary128 */
+ { "oct", 237, 262144, -262377 }, /* binary256, not in the IEEE 754-2008 standard */
/*
* For any bitwidth = 32 * k ( k >= 4),
@@ -1081,6 +1081,8 @@ do_mpfr_strtonum(int nargs)
force_mpnum(r, true, use_lc_numeric);
r->stptr = NULL;
r->stlen = 0;
+ r->wstptr = NULL;
+ r->wstlen = 0;
} else if (is_mpg_float(tmp)) {
int tval;
r = mpg_float();
diff --git a/msg.c b/msg.c
index ffca3355..12fc18f5 100644
--- a/msg.c
+++ b/msg.c
@@ -76,19 +76,21 @@ err(bool isfatal, const char *s, const char *emsg, va_list argp)
val = mpg_update_var(FNR_node);
assert((val->flags & MPZN) != 0);
if (mpz_sgn(val->mpg_i) > 0) {
+ int len = FILENAME_node->var_value->stlen;
file = FILENAME_node->var_value->stptr;
(void) putc('(', stderr);
if (file)
- (void) fprintf(stderr, "FILENAME=%s ", file);
+ (void) fprintf(stderr, "FILENAME=%.*s ", len, file);
(void) mpfr_fprintf(stderr, "FNR=%Zd) ", val->mpg_i);
}
} else
#endif
if (FNR > 0) {
+ int len = FILENAME_node->var_value->stlen;
file = FILENAME_node->var_value->stptr;
(void) putc('(', stderr);
if (file)
- (void) fprintf(stderr, "FILENAME=%s ", file);
+ (void) fprintf(stderr, "FILENAME=%.*s ", len, file);
(void) fprintf(stderr, "FNR=%ld) ", FNR);
}