diff options
Diffstat (limited to 'sed/compile.c')
-rw-r--r-- | sed/compile.c | 1734 |
1 files changed, 1734 insertions, 0 deletions
diff --git a/sed/compile.c b/sed/compile.c new file mode 100644 index 0000000..513fac5 --- /dev/null +++ b/sed/compile.c @@ -0,0 +1,1734 @@ +/* GNU SED, a batch stream editor. + Copyright (C) 1989,90,91,92,93,94,95,98,99,2002,2003,2004,2005,2006,2008,2010 + Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ + +/* compile.c: translate sed source into internal form */ + +#include "sed.h" +#include <stdio.h> +#include <ctype.h> +#include <string.h> +#include <stdlib.h> +#include <sys/types.h> +#include <obstack.h> + + +#define YMAP_LENGTH 256 /*XXX shouldn't this be (UCHAR_MAX+1)?*/ +#define VECTOR_ALLOC_INCREMENT 40 + +/* let's not confuse text editors that have only dumb bracket-matching... */ +#define OPEN_BRACKET '[' +#define CLOSE_BRACKET ']' +#define OPEN_BRACE '{' +#define CLOSE_BRACE '}' + +struct prog_info { + /* When we're reading a script command from a string, `prog.base' + points to the first character in the string, 'prog.cur' points + to the current character in the string, and 'prog.end' points + to the end of the string. This allows us to compile script + strings that contain nulls. */ + const unsigned char *base; + const unsigned char *cur; + const unsigned char *end; + + /* This is the current script file. If it is NULL, we are reading + from a string stored at `prog.cur' instead. If both `prog.file' + and `prog.cur' are NULL, we're in trouble! */ + FILE *file; +}; + +/* Information used to give out useful and informative error messages. */ +struct error_info { + /* This is the name of the current script file. */ + const char *name; + + /* This is the number of the current script line that we're compiling. */ + countT line; + + /* This is the index of the "-e" expressions on the command line. */ + countT string_expr_count; +}; + + +/* Label structure used to resolve GOTO's, labels, and block beginnings. */ +struct sed_label { + countT v_index; /* index of vector element being referenced */ + char *name; /* NUL-terminated name of the label */ + struct error_info err_info; /* track where `{}' blocks start */ + struct sed_label *next; /* linked list (stack) */ +}; + +struct special_files { + struct output outf; + FILE **pfp; +}; + +FILE *my_stdin, *my_stdout, *my_stderr; +struct special_files special_files[] = { + { { "/dev/stdin", false, NULL, NULL }, &my_stdin }, + { { "/dev/stdout", false, NULL, NULL }, &my_stdout }, + { { "/dev/stderr", false, NULL, NULL }, &my_stderr }, + { { NULL, false, NULL, NULL }, NULL } +}; + + +/* Where we are in the processing of the input. */ +static struct prog_info prog; +static struct error_info cur_input; + +/* Information about labels and jumps-to-labels. This is used to do + the required backpatching after we have compiled all the scripts. */ +static struct sed_label *jumps = NULL; +static struct sed_label *labels = NULL; + +/* We wish to detect #n magic only in the first input argument; + this flag tracks when we have consumed the first file of input. */ +static bool first_script = true; + +/* Allow for scripts like "sed -e 'i\' -e foo": */ +static struct buffer *pending_text = NULL; +static struct text_buf *old_text_buf = NULL; + +/* Information about block start positions. This is used to backpatch + block end positions. */ +static struct sed_label *blocks = NULL; + +/* Use an obstack for compilation. */ +static struct obstack obs; + +/* Various error messages we may want to print */ +static const char errors[] = + "multiple `!'s\0" + "unexpected `,'\0" + "invalid usage of +N or ~N as first address\0" + "unmatched `{'\0" + "unexpected `}'\0" + "extra characters after command\0" + "expected \\ after `a', `c' or `i'\0" + "`}' doesn't want any addresses\0" + ": doesn't want any addresses\0" + "comments don't accept any addresses\0" + "missing command\0" + "command only uses one address\0" + "unterminated address regex\0" + "unterminated `s' command\0" + "unterminated `y' command\0" + "unknown option to `s'\0" + "multiple `p' options to `s' command\0" + "multiple `g' options to `s' command\0" + "multiple number options to `s' command\0" + "number option to `s' command may not be zero\0" + "strings for `y' command are different lengths\0" + "delimiter character is not a single-byte character\0" + "expected newer version of sed\0" + "invalid usage of line address 0\0" + "unknown command: `%c'\0" + "incomplete command"; + +#define BAD_BANG (errors) +#define BAD_COMMA (BAD_BANG + sizeof(N_("multiple `!'s"))) +#define BAD_STEP (BAD_COMMA + sizeof(N_("unexpected `,'"))) +#define EXCESS_OPEN_BRACE (BAD_STEP + sizeof(N_("invalid usage of +N or ~N as first address"))) +#define EXCESS_CLOSE_BRACE (EXCESS_OPEN_BRACE + sizeof(N_("unmatched `{'"))) +#define EXCESS_JUNK (EXCESS_CLOSE_BRACE + sizeof(N_("unexpected `}'"))) +#define EXPECTED_SLASH (EXCESS_JUNK + sizeof(N_("extra characters after command"))) +#define NO_CLOSE_BRACE_ADDR (EXPECTED_SLASH + sizeof(N_("expected \\ after `a', `c' or `i'"))) +#define NO_COLON_ADDR (NO_CLOSE_BRACE_ADDR + sizeof(N_("`}' doesn't want any addresses"))) +#define NO_SHARP_ADDR (NO_COLON_ADDR + sizeof(N_(": doesn't want any addresses"))) +#define NO_COMMAND (NO_SHARP_ADDR + sizeof(N_("comments don't accept any addresses"))) +#define ONE_ADDR (NO_COMMAND + sizeof(N_("missing command"))) +#define UNTERM_ADDR_RE (ONE_ADDR + sizeof(N_("command only uses one address"))) +#define UNTERM_S_CMD (UNTERM_ADDR_RE + sizeof(N_("unterminated address regex"))) +#define UNTERM_Y_CMD (UNTERM_S_CMD + sizeof(N_("unterminated `s' command"))) +#define UNKNOWN_S_OPT (UNTERM_Y_CMD + sizeof(N_("unterminated `y' command"))) +#define EXCESS_P_OPT (UNKNOWN_S_OPT + sizeof(N_("unknown option to `s'"))) +#define EXCESS_G_OPT (EXCESS_P_OPT + sizeof(N_("multiple `p' options to `s' command"))) +#define EXCESS_N_OPT (EXCESS_G_OPT + sizeof(N_("multiple `g' options to `s' command"))) +#define ZERO_N_OPT (EXCESS_N_OPT + sizeof(N_("multiple number options to `s' command"))) +#define Y_CMD_LEN (ZERO_N_OPT + sizeof(N_("number option to `s' command may not be zero"))) +#define BAD_DELIM (Y_CMD_LEN + sizeof(N_("strings for `y' command are different lengths"))) +#define ANCIENT_VERSION (BAD_DELIM + sizeof(N_("delimiter character is not a single-byte character"))) +#define INVALID_LINE_0 (ANCIENT_VERSION + sizeof(N_("expected newer version of sed"))) +#define UNKNOWN_CMD (INVALID_LINE_0 + sizeof(N_("invalid usage of line address 0"))) +#define INCOMPLETE_CMD (UNKNOWN_CMD + sizeof(N_("unknown command: `%c'"))) +#define END_ERRORS (INCOMPLETE_CMD + sizeof(N_("incomplete command"))) + +static struct output *file_read = NULL; +static struct output *file_write = NULL; + + +/* Complain about an unknown command and exit. */ +void +bad_command(ch) + char ch; +{ + const char *msg = _(UNKNOWN_CMD); + char *unknown_cmd = xmalloc(strlen(msg)); + sprintf(unknown_cmd, msg, ch); + bad_prog(unknown_cmd); +} + +/* Complain about a programming error and exit. */ +void +bad_prog(why) + const char *why; +{ + if (cur_input.name) + fprintf(stderr, _("%s: file %s line %lu: %s\n"), + myname, cur_input.name, (unsigned long)cur_input.line, why); + else + fprintf(stderr, _("%s: -e expression #%lu, char %lu: %s\n"), + myname, + (unsigned long)cur_input.string_expr_count, + (unsigned long)(prog.cur-prog.base), + why); + exit(EXIT_FAILURE); +} + + +/* Read the next character from the program. Return EOF if there isn't + anything to read. Keep cur_input.line up to date, so error messages + can be meaningful. */ +static int inchar (void); +static int +inchar() +{ + int ch = EOF; + + if (prog.cur) + { + if (prog.cur < prog.end) + ch = *prog.cur++; + } + else if (prog.file) + { + if (!feof(prog.file)) + ch = getc(prog.file); + } + if (ch == '\n') + ++cur_input.line; + return ch; +} + +/* unget `ch' so the next call to inchar will return it. */ +static void savchar (int ch); +static void +savchar(ch) + int ch; +{ + if (ch == EOF) + return; + if (ch == '\n' && cur_input.line > 0) + --cur_input.line; + if (prog.cur) + { + if (prog.cur <= prog.base || *--prog.cur != ch) + panic("Called savchar() with unexpected pushback (%x)", + (unsigned char)ch); + } + else + ungetc(ch, prog.file); +} + +/* Read the next non-blank character from the program. */ +static int in_nonblank (void); +static int +in_nonblank() +{ + int ch; + do + ch = inchar(); + while (ISBLANK(ch)); + return ch; +} + +/* Read an integer value from the program. */ +static countT in_integer (int ch); +static countT +in_integer(ch) + int ch; +{ + countT num = 0; + + while (ISDIGIT(ch)) + { + num = num * 10 + ch - '0'; + ch = inchar(); + } + savchar(ch); + return num; +} + +static int add_then_next (struct buffer *b, int ch); +static int +add_then_next(b, ch) + struct buffer *b; + int ch; +{ + add1_buffer(b, ch); + return inchar(); +} + +static char * convert_number (char *, char *, const char *, int); +static char * +convert_number(result, buf, bufend, base) + char *result; + char *buf; + const char *bufend; + int base; +{ + int n = 0; + int max = 1; + char *p; + + for (p=buf+1; p < bufend && max <= 255; ++p, max *= base) + { + int d = -1; + switch (*p) + { + case '0': d = 0x0; break; + case '1': d = 0x1; break; + case '2': d = 0x2; break; + case '3': d = 0x3; break; + case '4': d = 0x4; break; + case '5': d = 0x5; break; + case '6': d = 0x6; break; + case '7': d = 0x7; break; + case '8': d = 0x8; break; + case '9': d = 0x9; break; + case 'A': case 'a': d = 0xa; break; + case 'B': case 'b': d = 0xb; break; + case 'C': case 'c': d = 0xc; break; + case 'D': case 'd': d = 0xd; break; + case 'E': case 'e': d = 0xe; break; + case 'F': case 'f': d = 0xf; break; + } + if (d < 0 || base <= d) + break; + n = n * base + d; + } + if (p == buf+1) + *result = *buf; + else + *result = n; + return p; +} + + +/* Read in a filename for a `r', `w', or `s///w' command. */ +static struct buffer *read_filename (void); +static struct buffer * +read_filename() +{ + struct buffer *b; + int ch; + + b = init_buffer(); + ch = in_nonblank(); + while (ch != EOF && ch != '\n') + { +#if 0 /*XXX ZZZ 1998-09-12 kpp: added, then had second thoughts*/ + if (posixicity == POSIXLY_EXTENDED) + if (ch == ';' || ch == '#') + { + savchar(ch); + break; + } +#endif + ch = add_then_next(b, ch); + } + add1_buffer(b, '\0'); + return b; +} + +static struct output *get_openfile (struct output **file_ptrs, const char *mode, int fail); +static struct output * +get_openfile(file_ptrs, mode, fail) + struct output **file_ptrs; + const char *mode; + int fail; +{ + struct buffer *b; + char *file_name; + struct output *p; + + b = read_filename(); + file_name = get_buffer(b); + for (p=*file_ptrs; p; p=p->link) + if (strcmp(p->name, file_name) == 0) + break; + + if (posixicity == POSIXLY_EXTENDED) + { + /* Check whether it is a special file (stdin, stdout or stderr) */ + struct special_files *special = special_files; + + /* std* sometimes are not constants, so they + cannot be used in the initializer for special_files */ + my_stdin = stdin; my_stdout = stdout; my_stderr = stderr; + for (special = special_files; special->outf.name; special++) + if (strcmp(special->outf.name, file_name) == 0) + { + special->outf.fp = *special->pfp; + free_buffer (b); + return &special->outf; + } + } + + if (!p) + { + p = OB_MALLOC(&obs, 1, struct output); + p->name = ck_strdup(file_name); + p->fp = ck_fopen(p->name, mode, fail); + p->missing_newline = false; + p->link = *file_ptrs; + *file_ptrs = p; + } + free_buffer(b); + return p; +} + + +static struct sed_cmd *next_cmd_entry (struct vector **vectorp); +static struct sed_cmd * +next_cmd_entry(vectorp) + struct vector **vectorp; +{ + struct sed_cmd *cmd; + struct vector *v; + + v = *vectorp; + if (v->v_length == v->v_allocated) + { + v->v_allocated += VECTOR_ALLOC_INCREMENT; + v->v = REALLOC(v->v, v->v_allocated, struct sed_cmd); + } + + cmd = v->v + v->v_length; + cmd->a1 = NULL; + cmd->a2 = NULL; + cmd->range_state = RANGE_INACTIVE; + cmd->addr_bang = false; + cmd->cmd = '\0'; /* something invalid, to catch bugs early */ + + *vectorp = v; + return cmd; +} + +static int snarf_char_class (struct buffer *b, mbstate_t *cur_stat); +static int +snarf_char_class(b, cur_stat) + struct buffer *b; + mbstate_t *cur_stat; +{ + int ch; + int state = 0; + int delim; + bool pending_mb = 0; + + ch = inchar(); + if (ch == '^') + ch = add_then_next(b, ch); + if (ch == CLOSE_BRACKET) + ch = add_then_next(b, ch); + + /* States are: + 0 outside a collation element, character class or collation class + 1 after the bracket + 2 after the opening ./:/= + 3 after the closing ./:/= */ + + for (;; ch = add_then_next (b, ch)) + { + pending_mb = BRLEN (ch, cur_stat) != 1; + + switch (ch) + { + case EOF: + case '\n': + return ch; + + case '.': + case ':': + case '=': + if (pending_mb) + continue; + + if (state == 1) + { + delim = ch; + state = 2; + } + else if (state == 2 && ch == delim) + state = 3; + else + break; + + continue; + + case OPEN_BRACKET: + if (pending_mb) + continue; + + if (state == 0) + state = 1; + continue; + + case CLOSE_BRACKET: + if (pending_mb) + continue; + + if (state == 0 || state == 1) + return ch; + else if (state == 3) + state = 0; + + break; + + default: + break; + } + + /* Getting a character different from .=: whilst in state 1 + goes back to state 0, getting a character different from ] + whilst in state 3 goes back to state 2. */ + state &= ~1; + } +} + +static struct buffer *match_slash (int slash, int regex); +static struct buffer * +match_slash(slash, regex) + int slash; + int regex; +{ + struct buffer *b; + int ch; + mbstate_t cur_stat; + + memset (&cur_stat, 0, sizeof (mbstate_t)); + + /* We allow only 1 byte characters for a slash. */ + if (BRLEN (slash, &cur_stat) == -2) + bad_prog (BAD_DELIM); + + memset (&cur_stat, 0, sizeof (mbstate_t)); + + b = init_buffer(); + while ((ch = inchar()) != EOF && ch != '\n') + { + bool pending_mb = !MBSINIT (&cur_stat); + if (BRLEN (ch, &cur_stat) == 1 && !pending_mb) + { + if (ch == slash) + return b; + else if (ch == '\\') + { + ch = inchar(); + if (ch == EOF) + break; +#ifndef REG_PERL + else if (ch == 'n' && regex) + ch = '\n'; +#endif + else if (ch != '\n' && (ch != slash || (!regex && ch == '&'))) + add1_buffer(b, '\\'); + } + else if (ch == OPEN_BRACKET && regex) + { + add1_buffer(b, ch); + ch = snarf_char_class(b, &cur_stat); + if (ch != CLOSE_BRACKET) + break; + } + } + + add1_buffer(b, ch); + } + + if (ch == '\n') + savchar(ch); /* for proper line number in error report */ + free_buffer(b); + return NULL; +} + +static int mark_subst_opts (struct subst *cmd); +static int +mark_subst_opts(cmd) + struct subst *cmd; +{ + int flags = 0; + int ch; + + cmd->global = false; + cmd->print = false; + cmd->eval = false; + cmd->numb = 0; + cmd->outf = NULL; + + for (;;) + switch ( (ch = in_nonblank()) ) + { + case 'i': /* GNU extension */ + case 'I': /* GNU extension */ + if (posixicity == POSIXLY_BASIC) + bad_prog(_(UNKNOWN_S_OPT)); + flags |= REG_ICASE; + break; + +#ifdef REG_PERL + case 's': /* GNU extension */ + case 'S': /* GNU extension */ + if (posixicity == POSIXLY_BASIC) + bad_prog(_(UNKNOWN_S_OPT)); + if (extended_regexp_flags & REG_PERL) + flags |= REG_DOTALL; + break; + + case 'x': /* GNU extension */ + case 'X': /* GNU extension */ + if (posixicity == POSIXLY_BASIC) + bad_prog(_(UNKNOWN_S_OPT)); + if (extended_regexp_flags & REG_PERL) + flags |= REG_EXTENDED; + break; +#endif + + case 'm': /* GNU extension */ + case 'M': /* GNU extension */ + if (posixicity == POSIXLY_BASIC) + bad_prog(_(UNKNOWN_S_OPT)); + flags |= REG_NEWLINE; + break; + + case 'e': + cmd->eval = true; + break; + + case 'p': + if (cmd->print) + bad_prog(_(EXCESS_P_OPT)); + cmd->print |= (1 << cmd->eval); /* 1=before eval, 2=after */ + break; + + case 'g': + if (cmd->global) + bad_prog(_(EXCESS_G_OPT)); + cmd->global = true; + break; + + case 'w': + cmd->outf = get_openfile(&file_write, write_mode, true); + return flags; + + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + if (cmd->numb) + bad_prog(_(EXCESS_N_OPT)); + cmd->numb = in_integer(ch); + if (!cmd->numb) + bad_prog(_(ZERO_N_OPT)); + break; + + case CLOSE_BRACE: + case '#': + savchar(ch); + /* Fall Through */ + case EOF: + case '\n': + case ';': + return flags; + + case '\r': + if (inchar() == '\n') + return flags; + /* FALLTHROUGH */ + + default: + bad_prog(_(UNKNOWN_S_OPT)); + /*NOTREACHED*/ + } +} + + +/* read in a label for a `:', `b', or `t' command */ +static char *read_label (void); +static char * +read_label() +{ + struct buffer *b; + int ch; + char *ret; + + b = init_buffer(); + ch = in_nonblank(); + + while (ch != EOF && ch != '\n' + && !ISBLANK(ch) && ch != ';' && ch != CLOSE_BRACE && ch != '#') + ch = add_then_next (b, ch); + + savchar(ch); + add1_buffer(b, '\0'); + ret = ck_strdup(get_buffer(b)); + free_buffer(b); + return ret; +} + +/* Store a label (or label reference) created by a `:', `b', or `t' + command so that the jump to/from the label can be backpatched after + compilation is complete, or a reference created by a `{' to be + backpatched when the corresponding `}' is found. */ +static struct sed_label *setup_label + (struct sed_label *, countT, char *, const struct error_info *); +static struct sed_label * +setup_label(list, idx, name, err_info) + struct sed_label *list; + countT idx; + char *name; + const struct error_info *err_info; +{ + struct sed_label *ret = OB_MALLOC(&obs, 1, struct sed_label); + ret->v_index = idx; + ret->name = name; + if (err_info) + memcpy(&ret->err_info, err_info, sizeof (ret->err_info)); + ret->next = list; + return ret; +} + +static struct sed_label *release_label (struct sed_label *list_head); +static struct sed_label * +release_label(list_head) + struct sed_label *list_head; +{ + struct sed_label *ret; + + if (!list_head) + return NULL; + ret = list_head->next; + + free(list_head->name); + +#if 0 + /* We use obstacks */ + free(list_head); +#endif + return ret; +} + +static struct replacement * +new_replacement(char *text, size_t length, enum replacement_types type) +{ + struct replacement *r = OB_MALLOC(&obs, 1, struct replacement); + + r->prefix = text; + r->prefix_length = length; + r->subst_id = -1; + r->repl_type = type; + + /* r-> next = NULL; */ + return r; +} + +static void setup_replacement (struct subst *, const char *, size_t); +static void +setup_replacement(sub, text, length) + struct subst *sub; + const char *text; + size_t length; +{ + char *base; + char *p; + char *text_end; + enum replacement_types repl_type = REPL_ASIS, save_type = REPL_ASIS; + struct replacement root; + struct replacement *tail; + + sub->max_id = 0; + base = MEMDUP(text, length, char); + length = normalize_text(base, length, TEXT_REPLACEMENT); + + text_end = base + length; + tail = &root; + + for (p=base; p<text_end; ++p) + { + if (*p == '\\') + { + /* Preceding the backslash may be some literal text: */ + tail = tail->next = + new_replacement(base, (size_t)(p - base), repl_type); + + repl_type = save_type; + + /* Skip the backslash and look for a numeric back-reference, + or a case-munging escape if not in POSIX mode: */ + ++p; + if (p == text_end) + ++tail->prefix_length; + + else if (posixicity == POSIXLY_BASIC && !ISDIGIT (*p)) + { + p[-1] = *p; + ++tail->prefix_length; + } + + else + switch (*p) + { + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + tail->subst_id = *p - '0'; + if (sub->max_id < tail->subst_id) + sub->max_id = tail->subst_id; + break; + + case 'L': + repl_type = REPL_LOWERCASE; + save_type = REPL_LOWERCASE; + break; + + case 'U': + repl_type = REPL_UPPERCASE; + save_type = REPL_UPPERCASE; + break; + + case 'E': + repl_type = REPL_ASIS; + save_type = REPL_ASIS; + break; + + case 'l': + save_type = repl_type; + repl_type |= REPL_LOWERCASE_FIRST; + break; + + case 'u': + save_type = repl_type; + repl_type |= REPL_UPPERCASE_FIRST; + break; + + default: + p[-1] = *p; + ++tail->prefix_length; + } + + base = p + 1; + } + else if (*p == '&') + { + /* Preceding the ampersand may be some literal text: */ + tail = tail->next = + new_replacement(base, (size_t)(p - base), repl_type); + + repl_type = save_type; + tail->subst_id = 0; + base = p + 1; + } + } + /* There may be some trailing literal text: */ + if (base < text_end) + tail = tail->next = + new_replacement(base, (size_t)(text_end - base), repl_type); + + tail->next = NULL; + sub->replacement = root.next; +} + +static void read_text (struct text_buf *buf, int leadin_ch); +static void +read_text(buf, leadin_ch) + struct text_buf *buf; + int leadin_ch; +{ + int ch; + + /* Should we start afresh (as opposed to continue a partial text)? */ + if (buf) + { + if (pending_text) + free_buffer(pending_text); + pending_text = init_buffer(); + buf->text = NULL; + buf->text_length = 0; + old_text_buf = buf; + } + /* assert(old_text_buf != NULL); */ + + if (leadin_ch == EOF) + return; + + if (leadin_ch != '\n') + add1_buffer(pending_text, leadin_ch); + + ch = inchar(); + while (ch != EOF && ch != '\n') + { + if (ch == '\\') + { + ch = inchar(); + if (ch != EOF) + add1_buffer (pending_text, '\\'); + } + + if (ch == EOF) + { + add1_buffer (pending_text, '\n'); + return; + } + + ch = add_then_next (pending_text, ch); + } + + add1_buffer(pending_text, '\n'); + if (!buf) + buf = old_text_buf; + buf->text_length = normalize_text (get_buffer (pending_text), + size_buffer (pending_text), TEXT_BUFFER); + buf->text = MEMDUP(get_buffer(pending_text), buf->text_length, char); + free_buffer(pending_text); + pending_text = NULL; +} + + +/* Try to read an address for a sed command. If it succeeds, + return non-zero and store the resulting address in `*addr'. + If the input doesn't look like an address read nothing + and return zero. */ +static bool compile_address (struct addr *addr, int ch); +static bool +compile_address(addr, ch) + struct addr *addr; + int ch; +{ + addr->addr_type = ADDR_IS_NULL; + addr->addr_step = 0; + addr->addr_number = ~(countT)0; /* extremely unlikely to ever match */ + addr->addr_regex = NULL; + + if (ch == '/' || ch == '\\') + { + int flags = 0; + struct buffer *b; + addr->addr_type = ADDR_IS_REGEX; + if (ch == '\\') + ch = inchar(); + if ( !(b = match_slash(ch, true)) ) + bad_prog(_(UNTERM_ADDR_RE)); + + for(;;) + { + ch = in_nonblank(); + if (posixicity == POSIXLY_BASIC) + goto posix_address_modifier; + switch(ch) + { + case 'I': /* GNU extension */ + flags |= REG_ICASE; + break; + +#ifdef REG_PERL + case 'S': /* GNU extension */ + if (extended_regexp_flags & REG_PERL) + flags |= REG_DOTALL; + break; + + case 'X': /* GNU extension */ + if (extended_regexp_flags & REG_PERL) + flags |= REG_EXTENDED; + break; +#endif + + case 'M': /* GNU extension */ + flags |= REG_NEWLINE; + break; + + default: + posix_address_modifier: + savchar (ch); + addr->addr_regex = compile_regex (b, flags, 0); + free_buffer(b); + return true; + } + } + } + else if (ISDIGIT(ch)) + { + addr->addr_number = in_integer(ch); + addr->addr_type = ADDR_IS_NUM; + ch = in_nonblank(); + if (ch != '~' || posixicity == POSIXLY_BASIC) + { + savchar(ch); + } + else + { + countT step = in_integer(in_nonblank()); + if (step > 0) + { + addr->addr_step = step; + addr->addr_type = ADDR_IS_NUM_MOD; + } + } + } + else if ((ch == '+' || ch == '~') && posixicity != POSIXLY_BASIC) + { + addr->addr_step = in_integer(in_nonblank()); + if (addr->addr_step==0) + ; /* default to ADDR_IS_NULL; forces matching to stop on next line */ + else if (ch == '+') + addr->addr_type = ADDR_IS_STEP; + else + addr->addr_type = ADDR_IS_STEP_MOD; + } + else if (ch == '$') + { + addr->addr_type = ADDR_IS_LAST; + } + else + return false; + + return true; +} + +/* Read a program (or a subprogram within `{' `}' pairs) in and store + the compiled form in `*vector'. Return a pointer to the new vector. */ +static struct vector *compile_program (struct vector *); +static struct vector * +compile_program(vector) + struct vector *vector; +{ + struct sed_cmd *cur_cmd; + struct buffer *b; + int ch; + + if (!vector) + { + vector = MALLOC(1, struct vector); + vector->v = NULL; + vector->v_allocated = 0; + vector->v_length = 0; + + obstack_init (&obs); + } + if (pending_text) + read_text(NULL, '\n'); + + for (;;) + { + struct addr a; + + while ((ch=inchar()) == ';' || ISSPACE(ch)) + ; + if (ch == EOF) + break; + + cur_cmd = next_cmd_entry(&vector); + if (compile_address(&a, ch)) + { + if (a.addr_type == ADDR_IS_STEP + || a.addr_type == ADDR_IS_STEP_MOD) + bad_prog(_(BAD_STEP)); + + cur_cmd->a1 = MEMDUP(&a, 1, struct addr); + ch = in_nonblank(); + if (ch == ',') + { + if (!compile_address(&a, in_nonblank())) + bad_prog(_(BAD_COMMA)); + + cur_cmd->a2 = MEMDUP(&a, 1, struct addr); + ch = in_nonblank(); + } + + if ((cur_cmd->a1->addr_type == ADDR_IS_NUM + && cur_cmd->a1->addr_number == 0) + && ((!cur_cmd->a2 || cur_cmd->a2->addr_type != ADDR_IS_REGEX) + || posixicity == POSIXLY_BASIC)) + bad_prog(_(INVALID_LINE_0)); + } + if (ch == '!') + { + cur_cmd->addr_bang = true; + ch = in_nonblank(); + if (ch == '!') + bad_prog(_(BAD_BANG)); + } + + /* Do not accept extended commands in --posix mode. Also, + a few commands only accept one address in that mode. */ + if (posixicity == POSIXLY_BASIC) + switch (ch) + { + case 'e': case 'F': case 'v': case 'z': case 'L': + case 'Q': case 'T': case 'R': case 'W': + bad_command(ch); + + case 'a': case 'i': case 'l': + case '=': case 'r': + if (cur_cmd->a2) + bad_prog(_(ONE_ADDR)); + } + + cur_cmd->cmd = ch; + switch (ch) + { + case '#': + if (cur_cmd->a1) + bad_prog(_(NO_SHARP_ADDR)); + ch = inchar(); + if (ch=='n' && first_script && cur_input.line < 2) + if ( (prog.base && prog.cur==2+prog.base) + || (prog.file && !prog.base && 2==ftell(prog.file))) + no_default_output = true; + while (ch != EOF && ch != '\n') + ch = inchar(); + continue; /* restart the for (;;) loop */ + + case 'v': + /* This is an extension. Programs needing GNU sed might start + * with a `v' command so that other seds will stop. + * We compare the version and ignore POSIXLY_CORRECT. + */ + { + char *version = read_label (); + char *compared_version; + compared_version = (*version == '\0') ? "4.0" : version; + if (strverscmp (compared_version, SED_FEATURE_VERSION) > 0) + bad_prog(_(ANCIENT_VERSION)); + + free (version); + posixicity = POSIXLY_EXTENDED; + } + continue; + + case '{': + blocks = setup_label(blocks, vector->v_length, NULL, &cur_input); + cur_cmd->addr_bang = !cur_cmd->addr_bang; + break; + + case '}': + if (!blocks) + bad_prog(_(EXCESS_CLOSE_BRACE)); + if (cur_cmd->a1) + bad_prog(_(NO_CLOSE_BRACE_ADDR)); + ch = in_nonblank(); + if (ch == CLOSE_BRACE || ch == '#') + savchar(ch); + else if (ch != EOF && ch != '\n' && ch != ';') + bad_prog(_(EXCESS_JUNK)); + + vector->v[blocks->v_index].x.jump_index = vector->v_length; + blocks = release_label(blocks); /* done with this entry */ + break; + + case 'e': + ch = in_nonblank(); + if (ch == EOF || ch == '\n') + { + cur_cmd->x.cmd_txt.text_length = 0; + break; + } + else + goto read_text_to_slash; + + case 'a': + case 'i': + case 'c': + ch = in_nonblank(); + + read_text_to_slash: + if (ch == EOF) + bad_prog(_(EXPECTED_SLASH)); + + if (ch == '\\') + ch = inchar(); + else + { + if (posixicity == POSIXLY_BASIC) + bad_prog(_(EXPECTED_SLASH)); + savchar(ch); + ch = '\n'; + } + + read_text(&cur_cmd->x.cmd_txt, ch); + break; + + case ':': + if (cur_cmd->a1) + bad_prog(_(NO_COLON_ADDR)); + labels = setup_label(labels, vector->v_length, read_label(), NULL); + break; + + case 'T': + case 'b': + case 't': + jumps = setup_label(jumps, vector->v_length, read_label(), NULL); + break; + + case 'Q': + case 'q': + if (cur_cmd->a2) + bad_prog(_(ONE_ADDR)); + /* Fall through */ + + case 'L': + case 'l': + ch = in_nonblank(); + if (ISDIGIT(ch) && posixicity != POSIXLY_BASIC) + { + cur_cmd->x.int_arg = in_integer(ch); + ch = in_nonblank(); + } + else + cur_cmd->x.int_arg = -1; + + if (ch == CLOSE_BRACE || ch == '#') + savchar(ch); + else if (ch != EOF && ch != '\n' && ch != ';') + bad_prog(_(EXCESS_JUNK)); + + break; + + case '=': + case 'd': + case 'D': + case 'F': + case 'g': + case 'G': + case 'h': + case 'H': + case 'n': + case 'N': + case 'p': + case 'P': + case 'z': + case 'x': + ch = in_nonblank(); + if (ch == CLOSE_BRACE || ch == '#') + savchar(ch); + else if (ch != EOF && ch != '\n' && ch != ';') + bad_prog(_(EXCESS_JUNK)); + break; + + case 'r': + b = read_filename(); + cur_cmd->x.fname = ck_strdup(get_buffer(b)); + free_buffer(b); + break; + + case 'R': + cur_cmd->x.fp = get_openfile(&file_read, read_mode, false)->fp; + break; + + case 'W': + case 'w': + cur_cmd->x.outf = get_openfile(&file_write, write_mode, true); + break; + + case 's': + { + struct buffer *b2; + int flags; + int slash; + + slash = inchar(); + if ( !(b = match_slash(slash, true)) ) + bad_prog(_(UNTERM_S_CMD)); + if ( !(b2 = match_slash(slash, false)) ) + bad_prog(_(UNTERM_S_CMD)); + + cur_cmd->x.cmd_subst = OB_MALLOC(&obs, 1, struct subst); + setup_replacement(cur_cmd->x.cmd_subst, + get_buffer(b2), size_buffer(b2)); + free_buffer(b2); + + flags = mark_subst_opts(cur_cmd->x.cmd_subst); + cur_cmd->x.cmd_subst->regx = + compile_regex(b, flags, cur_cmd->x.cmd_subst->max_id + 1); + free_buffer(b); + } + break; + + case 'y': + { + size_t len, dest_len; + int slash; + struct buffer *b2; + char *src_buf, *dest_buf; + + slash = inchar(); + if ( !(b = match_slash(slash, false)) ) + bad_prog(_(UNTERM_Y_CMD)); + src_buf = get_buffer(b); + len = normalize_text(src_buf, size_buffer (b), TEXT_BUFFER); + + if ( !(b2 = match_slash(slash, false)) ) + bad_prog(_(UNTERM_Y_CMD)); + dest_buf = get_buffer(b2); + dest_len = normalize_text(dest_buf, size_buffer (b2), TEXT_BUFFER); + + if (mb_cur_max > 1) + { + int i, j, idx, src_char_num; + size_t *src_lens = MALLOC(len, size_t); + char **trans_pairs; + size_t mbclen; + mbstate_t cur_stat; + + /* Enumerate how many character the source buffer has. */ + memset(&cur_stat, 0, sizeof(mbstate_t)); + for (i = 0, j = 0; i < len;) + { + mbclen = MBRLEN (src_buf + i, len - i, &cur_stat); + /* An invalid sequence, or a truncated multibyte character. + We treat it as a singlebyte character. */ + if (mbclen == (size_t) -1 || mbclen == (size_t) -2 + || mbclen == 0) + mbclen = 1; + src_lens[j++] = mbclen; + i += mbclen; + } + src_char_num = j; + + memset(&cur_stat, 0, sizeof(mbstate_t)); + idx = 0; + + /* trans_pairs = {src(0), dest(0), src(1), dest(1), ..., NULL} + src(i) : pointer to i-th source character. + dest(i) : pointer to i-th destination character. + NULL : terminator */ + trans_pairs = MALLOC(2 * src_char_num + 1, char*); + cur_cmd->x.translatemb = trans_pairs; + for (i = 0; i < src_char_num; i++) + { + if (idx >= dest_len) + bad_prog(_(Y_CMD_LEN)); + + /* Set the i-th source character. */ + trans_pairs[2 * i] = MALLOC(src_lens[i] + 1, char); + strncpy(trans_pairs[2 * i], src_buf, src_lens[i]); + trans_pairs[2 * i][src_lens[i]] = '\0'; + src_buf += src_lens[i]; /* Forward to next character. */ + + /* Fetch the i-th destination character. */ + mbclen = MBRLEN (dest_buf + idx, dest_len - idx, &cur_stat); + /* An invalid sequence, or a truncated multibyte character. + We treat it as a singlebyte character. */ + if (mbclen == (size_t) -1 || mbclen == (size_t) -2 + || mbclen == 0) + mbclen = 1; + + /* Set the i-th destination character. */ + trans_pairs[2 * i + 1] = MALLOC(mbclen + 1, char); + strncpy(trans_pairs[2 * i + 1], dest_buf + idx, mbclen); + trans_pairs[2 * i + 1][mbclen] = '\0'; + idx += mbclen; /* Forward to next character. */ + } + trans_pairs[2 * i] = NULL; + if (idx != dest_len) + bad_prog(_(Y_CMD_LEN)); + } + else + { + unsigned char *translate = + OB_MALLOC(&obs, YMAP_LENGTH, unsigned char); + unsigned char *ustring = (unsigned char *)src_buf; + + if (len != dest_len) + bad_prog(_(Y_CMD_LEN)); + + for (len = 0; len < YMAP_LENGTH; len++) + translate[len] = len; + + while (dest_len--) + translate[*ustring++] = (unsigned char)*dest_buf++; + + cur_cmd->x.translate = translate; + } + + if ((ch = in_nonblank()) != EOF && ch != '\n' && ch != ';') + bad_prog(_(EXCESS_JUNK)); + + free_buffer(b); + free_buffer(b2); + } + break; + + case EOF: + bad_prog(_(NO_COMMAND)); + /*NOTREACHED*/ + + default: + bad_command (ch); + /*NOTREACHED*/ + } + + /* this is buried down here so that "continue" statements will miss it */ + ++vector->v_length; + } + if (posixicity == POSIXLY_BASIC && pending_text) + bad_prog (_(INCOMPLETE_CMD)); + return vector; +} + + +/* deal with \X escapes */ +size_t +normalize_text(buf, len, buftype) + char *buf; + size_t len; + enum text_types buftype; +{ + const char *bufend = buf + len; + char *p = buf; + char *q = buf; + char ch; + int base; + + /* This variable prevents normalizing text within bracket + subexpressions when conforming to POSIX. If 0, we + are not within a bracket expression. If -1, we are within a + bracket expression but are not within [.FOO.], [=FOO=], + or [:FOO:]. Otherwise, this is the '.', '=', or ':' + respectively within these three types of subexpressions. */ + int bracket_state = 0; + + int mbclen; + mbstate_t cur_stat; + memset(&cur_stat, 0, sizeof(mbstate_t)); + + while (p < bufend) + { + mbclen = MBRLEN (p, bufend - p, &cur_stat); + if (mbclen != 1) + { + /* An invalid sequence, or a truncated multibyte character. + We treat it as a singlebyte character. */ + if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) + mbclen = 1; + + memmove (q, p, mbclen); + q += mbclen; + p += mbclen; + continue; + } + + if (*p == '\\' && p+1 < bufend && bracket_state == 0) + switch (*++p) + { +#if defined __STDC__ && __STDC__-0 + case 'a': *q++ = '\a'; p++; continue; +#else /* Not STDC; we'll just assume ASCII */ + case 'a': *q++ = '\007'; p++; continue; +#endif + /* case 'b': *q++ = '\b'; p++; continue; --- conflicts with \b RE */ + case 'f': *q++ = '\f'; p++; continue; + case '\n': /*fall through */ + case 'n': *q++ = '\n'; p++; continue; + case 'r': *q++ = '\r'; p++; continue; + case 't': *q++ = '\t'; p++; continue; + case 'v': *q++ = '\v'; p++; continue; + + case 'd': /* decimal byte */ + base = 10; + goto convert; + + case 'x': /* hexadecimal byte */ + base = 16; + goto convert; + +#ifdef REG_PERL + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + if ((extended_regexp_flags & REG_PERL) + && p+1 < bufend + && p[1] >= '0' && p[1] <= '9') + { + base = 8; + goto convert; + } + else + { + /* we just pass the \ up one level for interpretation */ + if (buftype != TEXT_BUFFER) + *q++ = '\\'; + } + + continue; + + case 'o': /* octal byte */ + if (!(extended_regexp_flags & REG_PERL)) + { + base = 8; + goto convert; + } + else + { + /* we just pass the \ up one level for interpretation */ + if (buftype != TEXT_BUFFER) + *q++ = '\\'; + } + + continue; +#else + case 'o': /* octal byte */ + base = 8; +#endif +convert: + p = convert_number(&ch, p, bufend, base); + + /* for an ampersand in a replacement, pass the \ up one level */ + if (buftype == TEXT_REPLACEMENT && ch == '&') + *q++ = '\\'; + *q++ = ch; + continue; + + case 'c': + if (++p < bufend) + { + *q++ = toupper((unsigned char) *p) ^ 0x40; + p++; + continue; + } + else + { + /* we just pass the \ up one level for interpretation */ + if (buftype != TEXT_BUFFER) + *q++ = '\\'; + continue; + } + + default: + /* we just pass the \ up one level for interpretation */ + if (buftype != TEXT_BUFFER) + *q++ = '\\'; + break; + } + else if (buftype == TEXT_REGEX && posixicity != POSIXLY_EXTENDED) + switch (*p) + { + case '[': + if (!bracket_state) + bracket_state = -1; + break; + + case ':': + case '.': + case '=': + if (bracket_state == -1 && p[-1] == '[') + bracket_state = *p; + break; + + case ']': + if (bracket_state == 0) + ; + else if (bracket_state == -1) + bracket_state = 0; + else if (p[-2] != bracket_state && p[-1] == bracket_state) + bracket_state = -1; + break; + } + + *q++ = *p++; + } + return (size_t)(q - buf); +} + + +/* `str' is a string (from the command line) that contains a sed command. + Compile the command, and add it to the end of `cur_program'. */ +struct vector * +compile_string(cur_program, str, len) + struct vector *cur_program; + char *str; + size_t len; +{ + static countT string_expr_count = 0; + struct vector *ret; + + prog.file = NULL; + prog.base = (unsigned char *)str; + prog.cur = prog.base; + prog.end = prog.cur + len; + + cur_input.line = 0; + cur_input.name = NULL; + cur_input.string_expr_count = ++string_expr_count; + + ret = compile_program(cur_program); + prog.base = NULL; + prog.cur = NULL; + prog.end = NULL; + + first_script = false; + return ret; +} + +/* `cmdfile' is the name of a file containing sed commands. + Read them in and add them to the end of `cur_program'. + */ +struct vector * +compile_file(cur_program, cmdfile) + struct vector *cur_program; + const char *cmdfile; +{ + struct vector *ret; + + prog.file = stdin; + if (cmdfile[0] != '-' || cmdfile[1] != '\0') + { +#ifdef HAVE_FOPEN_RT + prog.file = ck_fopen(cmdfile, "rt", true); +#else + prog.file = ck_fopen(cmdfile, "r", true); +#endif + } + + cur_input.line = 1; + cur_input.name = cmdfile; + cur_input.string_expr_count = 0; + + ret = compile_program(cur_program); + if (prog.file != stdin) + ck_fclose(prog.file); + prog.file = NULL; + + first_script = false; + return ret; +} + +/* Make any checks which require the whole program to have been read. + In particular: this backpatches the jump targets. + Any cleanup which can be done after these checks is done here also. */ +void +check_final_program(program) + struct vector *program; +{ + struct sed_label *go; + struct sed_label *lbl; + + /* do all "{"s have a corresponding "}"? */ + if (blocks) + { + /* update info for error reporting: */ + memcpy(&cur_input, &blocks->err_info, sizeof (cur_input)); + bad_prog(_(EXCESS_OPEN_BRACE)); + } + + /* was the final command an unterminated a/c/i command? */ + if (pending_text) + { + old_text_buf->text_length = size_buffer(pending_text); + if (old_text_buf->text_length) + old_text_buf->text = MEMDUP(get_buffer(pending_text), + old_text_buf->text_length, char); + free_buffer(pending_text); + pending_text = NULL; + } + + for (go = jumps; go; go = release_label(go)) + { + for (lbl = labels; lbl; lbl = lbl->next) + if (strcmp(lbl->name, go->name) == 0) + break; + if (lbl) + { + program->v[go->v_index].x.jump_index = lbl->v_index; + } + else + { + if (*go->name) + panic(_("can't find label for jump to `%s'"), go->name); + program->v[go->v_index].x.jump_index = program->v_length; + } + } + jumps = NULL; + + for (lbl = labels; lbl; lbl = release_label(lbl)) + ; + labels = NULL; + + /* There is no longer a need to track file names: */ + { + struct output *p; + + for (p=file_read; p; p=p->link) + if (p->name) + { + free(p->name); + p->name = NULL; + } + + for (p=file_write; p; p=p->link) + if (p->name) + { + free(p->name); + p->name = NULL; + } + } +} + +/* Rewind all resources which were allocated in this module. */ +void +rewind_read_files() +{ + struct output *p; + + for (p=file_read; p; p=p->link) + if (p->fp) + rewind(p->fp); +} + +/* Release all resources which were allocated in this module. */ +void +finish_program(program) + struct vector *program; +{ + /* close all files... */ + { + struct output *p, *q; + + for (p=file_read; p; p=q) + { + if (p->fp) + ck_fclose(p->fp); + q = p->link; +#if 0 + /* We use obstacks. */ + free(p); +#endif + } + + for (p=file_write; p; p=q) + { + if (p->fp) + ck_fclose(p->fp); + q = p->link; +#if 0 + /* We use obstacks. */ + free(p); +#endif + } + file_read = file_write = NULL; + } + +#ifdef DEBUG_LEAKS + obstack_free (&obs, NULL); +#endif /*DEBUG_LEAKS*/ +} |