diff options
author | Arnold D. Robbins <arnold@skeeve.com> | 2023-03-16 18:25:43 +0200 |
---|---|---|
committer | Arnold D. Robbins <arnold@skeeve.com> | 2023-03-16 18:25:43 +0200 |
commit | 2e18b77f5b6926e6616ce22d2d3e6d511de69c9b (patch) | |
tree | dfb889b9411a7f126facaeb465aebb2247961d72 | |
parent | ed09e9c66665f98eb070cc28d87abb9cb0096c3f (diff) | |
download | gawk-2e18b77f5b6926e6616ce22d2d3e6d511de69c9b.tar.gz |
Start revamp of CSV handling.
-rw-r--r-- | ChangeLog | 17 | ||||
-rw-r--r-- | awk.h | 4 | ||||
-rw-r--r-- | field.c | 30 | ||||
-rw-r--r-- | io.c | 45 | ||||
-rw-r--r-- | main.c | 18 | ||||
-rw-r--r-- | pc/ChangeLog | 4 | ||||
-rw-r--r-- | pc/Makefile.tst | 7 | ||||
-rw-r--r-- | test/ChangeLog | 7 | ||||
-rwxr-xr-x | test/Gentests | 11 | ||||
-rw-r--r-- | test/Makefile.am | 5 | ||||
-rw-r--r-- | test/Makefile.in | 7 | ||||
-rw-r--r-- | test/Maketests | 2 | ||||
-rw-r--r-- | test/badargs.ok | 1 | ||||
-rw-r--r-- | test/csv1.awk | 6 |
14 files changed, 148 insertions, 16 deletions
@@ -1,3 +1,20 @@ +2023-03-16 Arnold D. Robbins <arnold@skeeve.com> + + * awk.h (enum do_flag_values): Add DO_CSV. + (do_csv): New macro. + (init_csv_fields, init_csv_records): Add declarations. + * field.c (init_csv_fields): New function. + (set_parser): Don't set the parser if doing CSV. Add warnings. + * io.c (csvscan): New function (placeholder for now). + (init_csv_records): New function. + (set_RS): Don't set the parser if doing CSV. Add warnings. + * main.c (optab): Add new options -k/--csv. + (main): Fatal out if --posix and --csv. Call init_csv_records() + and init_csv_fields(). + (usage): Add a line for the new options. + (load_procinfo): Install PROCINFO["CSV"] if doing CSV. + (parse_args): Update for new options. + 2023-03-09 Arnold D. Robbins <arnold@skeeve.com> * gawkapi.h: Update copyright year. Small edit in leading comment. @@ -1173,6 +1173,7 @@ extern enum do_flag_values { DO_PROFILE = 0x02000, /* profile the program */ DO_DEBUG = 0x04000, /* debug the program */ DO_MPFR = 0x08000, /* arbitrary-precision floating-point math */ + DO_CSV = 0x10000, /* process comma-separated-value files */ } do_flags; #define do_traditional (do_flags & DO_TRADITIONAL) @@ -1187,6 +1188,7 @@ extern enum do_flag_values { #define do_sandbox (do_flags & DO_SANDBOX) #define do_debug (do_flags & DO_DEBUG) #define do_mpfr (do_flags & DO_MPFR) +#define do_csv (do_flags & DO_CSV) extern bool do_optimize; extern int use_lc_numeric; @@ -1569,6 +1571,7 @@ extern NODE *get_actual_argument(NODE *, int, bool); #endif /* field.c */ extern void init_fields(void); +extern void init_csv_fields(void); extern void set_record(const char *buf, int cnt, const awk_fieldwidth_info_t *); extern void reset_record(void); extern void rebuild_record(void); @@ -1629,6 +1632,7 @@ extern int isdirpunct(int c); /* io.c */ extern void init_sockets(void); extern void init_io(void); +extern void init_csv_records(void); extern void register_input_parser(awk_input_parser_t *input_parser); extern void register_output_wrapper(awk_output_wrapper_t *wrapper); extern void register_two_way_processor(awk_two_way_processor_t *processor); @@ -114,6 +114,15 @@ init_fields() field0_valid = true; } +/* init_csv_fields --- set up to handle --csv */ + +void +init_csv_fields(void) +{ + if (do_csv) + parse_field = comma_parse_field; +} + /* grow_fields --- acquire new fields as needed */ static void @@ -771,6 +780,7 @@ sc_parse_field(long up_to, /* parse only up to this field number */ * via (*parse_field)(). This variation is for when FS is a comma, * we do very basic CSV parsing, the same as BWK awk. */ + static long comma_parse_field(long up_to, /* parse only up to this field number */ char **buf, /* on input: string to parse; on output: point to start next */ @@ -1285,11 +1295,29 @@ do_patsplit(int nargs) static void set_parser(parse_field_func_t func) { + /* + * Setting FS does nothing if CSV mode, warn in that case, + * but don't warn on first call which happens at initialization. + */ + static bool first_time = true; + static bool warned = false; + + if (! first_time && do_csv) { + if (! warned) { + warned = true; + warning(_("assignment to FS/FIELDWIDTHS/FPAT has no effect when using --csv")); + } + return; + } + normal_parse_field = func; if (! api_parser_override && parse_field != func) { parse_field = func; update_PROCINFO_str("FS", current_field_sep_str()); } + + if (first_time) + first_time = false; } /* set_FIELDWIDTHS --- handle an assignment to FIELDWIDTHS */ @@ -1503,8 +1531,6 @@ choose_fs_function: else if (fs->stptr[0] == '\\') /* same special case */ strcpy(buf, "[\\\\]"); - else if (fs->stptr[0] == ',' && ! do_posix) - set_parser(comma_parse_field); else set_parser(sc_parse_field); } @@ -265,6 +265,7 @@ static bool avoid_flush(const char *name); static RECVALUE rs1scan(IOBUF *iop, struct recmatch *recm, SCANSTATE *state); static RECVALUE rsnullscan(IOBUF *iop, struct recmatch *recm, SCANSTATE *state); static RECVALUE rsrescan(IOBUF *iop, struct recmatch *recm, SCANSTATE *state); +static RECVALUE csvscan(IOBUF *iop, struct recmatch *recm, SCANSTATE *state); static RECVALUE (*matchrec)(IOBUF *iop, struct recmatch *recm, SCANSTATE *state) = rs1scan; @@ -341,6 +342,15 @@ init_io() read_can_timeout = true; } +/* init_csv_records --- set up for CSV handling */ + +void +init_csv_records(void) +{ + if (do_csv) + matchrec = csvscan; +} + #if defined(__MINGW32__) || defined(__CYGWIN__) /* binmode --- convert BINMODE to string for fopen */ @@ -3820,6 +3830,14 @@ find_longest_terminator: return REC_OK; } +/* csvscan --- handle --csv mode */ + +static RECVALUE +csvscan(IOBUF *iop, struct recmatch *recm, SCANSTATE *state) +{ + return rs1scan(iop, recm, state); // XXX so it'll compile and run +} + /* retryable --- return true if PROCINFO[<filename>, "RETRY"] exists */ static inline int @@ -4069,6 +4087,13 @@ get_a_record(char **out, /* pointer to pointer to data */ void set_RS() { + /* + * Setting RS does nothing if CSV mode, warn in that case, + * but don't warn on first call which happens at initialization. + */ + static bool first_time = true; + static bool warned = false; + static NODE *save_rs = NULL; /* @@ -4099,9 +4124,15 @@ set_RS() refree(RS_re[1]); RS_re[0] = RS_re[1] = RS_regexp = NULL; + if (! first_time && ! warned && do_csv) { + warned = true; + warning(_("assignment to RS has no effect when using --csv")); + } + if (RS->stlen == 0) { RS_is_null = true; - matchrec = rsnullscan; + if (first_time || ! do_csv) + matchrec = rsnullscan; } else if ((RS->stlen > 1 || (RS->flags & REGEX) != 0) && ! do_traditional) { static bool warned = false; @@ -4109,17 +4140,23 @@ set_RS() RS_re[1] = make_regexp(RS->stptr, RS->stlen, true, true, true); RS_regexp = RS_re[IGNORECASE]; - matchrec = rsrescan; + if (first_time || ! do_csv) + matchrec = rsrescan; if (do_lint_extensions && ! warned) { lintwarn(_("multicharacter value of `RS' is a gawk extension")); warned = true; } - } else - matchrec = rs1scan; + } else { + if (first_time || ! do_csv) + matchrec = rs1scan; + } set_FS: if (current_field_sep() == Using_FS) set_FS(); + + if (first_time) + first_time = false; } @@ -171,6 +171,7 @@ static const struct option optab[] = { { "bignum", no_argument, NULL, 'M' }, { "characters-as-bytes", no_argument, & do_binary, 'b' }, { "copyright", no_argument, NULL, 'C' }, + { "csv", no_argument, NULL, 'k' }, { "debug", optional_argument, NULL, 'D' }, { "dump-variables", optional_argument, NULL, 'd' }, { "exec", required_argument, NULL, 'E' }, @@ -375,6 +376,9 @@ main(int argc, char **argv) } } + if (do_csv && do_posix) + fatal(_("`--posix' and `--csv' conflict")); + if (do_lint) { if (os_is_setuid()) lintwarn(_("running %s setuid root may be a security problem"), myname); @@ -415,6 +419,10 @@ main(int argc, char **argv) /* Set up the special variables */ init_vars(); + /* set up CSV */ + init_csv_records(); + init_csv_fields(); + /* Set up the field variables */ init_fields(); @@ -624,6 +632,7 @@ usage(int exitval, FILE *fp) fputs(_("\t-h\t\t\t--help\n"), fp); fputs(_("\t-i includefile\t\t--include=includefile\n"), fp); fputs(_("\t-I\t\t\t--trace\n"), fp); + fputs(_("\t-k\t\t\t--csv\n"), fp); fputs(_("\t-l library\t\t--load=library\n"), fp); /* * TRANSLATORS: the "fatal", "invalid" and "no-ext" here are literal @@ -1105,6 +1114,9 @@ load_procinfo() update_PROCINFO_str("pma", get_pma_version()); #endif /* USE_PERSISTENT_MALLOC */ + if (do_csv) + update_PROCINFO_num("CSV", 1); + load_procinfo_argv(); return PROCINFO_node; } @@ -1569,7 +1581,7 @@ parse_args(int argc, char **argv) /* * The + on the front tells GNU getopt not to rearrange argv. */ - const char *optlist = "+F:f:v:W;bcCd::D::e:E:ghi:Il:L::nNo::Op::MPrSstVYZ:"; + const char *optlist = "+F:f:v:W;bcCd::D::e:E:ghi:kIl:L::nNo::Op::MPrSstVYZ:"; int old_optind; int c; char *scan; @@ -1668,6 +1680,10 @@ parse_args(int argc, char **argv) do_itrace = true; break; + case 'k': // k is for "comma". it's a stretch, I know + do_flags |= DO_CSV; + break; + case 'l': (void) add_srcfile(SRC_EXTLIB, optarg, srcfiles, NULL, NULL); break; diff --git a/pc/ChangeLog b/pc/ChangeLog index e809bfac..541be9c5 100644 --- a/pc/ChangeLog +++ b/pc/ChangeLog @@ -1,3 +1,7 @@ +2023-03-16 Arnold D. Robbins <arnold@skeeve.com> + + * Makefile.tst: Regenerated. + 2023-03-12 Eli Zaretskii <eliz@gnu.org> * Makefile.ext (readdir_test.$(SOEXT)): Fix typo. diff --git a/pc/Makefile.tst b/pc/Makefile.tst index 316d778c..72f8a9cb 100644 --- a/pc/Makefile.tst +++ b/pc/Makefile.tst @@ -288,9 +288,12 @@ NEED_SANDBOX = sandbox1 # List of tests that need --traditional NEED_TRADITIONAL = litoct tradanch rscompat -# Lists of tests that need the PMA allocator and a backing file +# List of tests that need the PMA allocator and a backing file NEED_PMA = pma +# List of tests that need --csv +NEED_CSV = csv1 + # Lists of tests that run a shell script RUN_SHELL = exit fflush localenl modifiers next randtest rtlen rtlen01 @@ -2721,7 +2724,7 @@ crlf: csv1: @echo $@ - @-AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ + @-AWKPATH="$(srcdir)" $(AWK) -f $@.awk --csv < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@ dbugeval2: diff --git a/test/ChangeLog b/test/ChangeLog index e206fa9b..f247c540 100644 --- a/test/ChangeLog +++ b/test/ChangeLog @@ -1,3 +1,10 @@ +2023-03-16 Arnold D. Robbins <arnold@skeeve.com> + + * Gentests: Handle NEED_CSV. + * Makefile.am (NEED_CSV): New list of tests that need --csv. + * badargs.ok: Update after code changes. + * csv1.awk: Adjust after code changes. + 2023-03-09 Arnold D. Robbins <arnold@skeeve.com> * badargs.ok: Update after code changes. diff --git a/test/Gentests b/test/Gentests index 42a81cff..b3a8f787 100755 --- a/test/Gentests +++ b/test/Gentests @@ -108,6 +108,13 @@ BEGIN { next } +/^NEED_CSV *=/,/[^\\]$/ { + gsub(/(^NEED_CSV *=|\\$)/,"") + for (i = 1; i <= NF; i++) + csv[$i] + next +} + /^GENTESTS_UNUSED *=/,/[^\\]$/ { gsub(/(^GENTESTS_UNUSED *=|\\$)/,"") for (i = 1; i <= NF; i++) @@ -229,6 +236,10 @@ function generate(x, s, i, locale_string) s = s " --re-interval" delete re_interval[x] } + if (x in csv) { + s = s " --csv" + delete csv[x] + } if (x".in" in files) { s = s " < \"$(srcdir)\"/$@.in" delete files[x".in"] diff --git a/test/Makefile.am b/test/Makefile.am index 6d000178..7bb4c983 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -1604,9 +1604,12 @@ NEED_SANDBOX = sandbox1 # List of tests that need --traditional NEED_TRADITIONAL = litoct tradanch rscompat -# Lists of tests that need the PMA allocator and a backing file +# List of tests that need the PMA allocator and a backing file NEED_PMA = pma +# List of tests that need --csv +NEED_CSV = csv1 + # Lists of tests that run a shell script RUN_SHELL = exit fflush localenl modifiers next randtest rtlen rtlen01 diff --git a/test/Makefile.in b/test/Makefile.in index 1cd775d1..28c5ebc5 100644 --- a/test/Makefile.in +++ b/test/Makefile.in @@ -1868,9 +1868,12 @@ NEED_SANDBOX = sandbox1 # List of tests that need --traditional NEED_TRADITIONAL = litoct tradanch rscompat -# Lists of tests that need the PMA allocator and a backing file +# List of tests that need the PMA allocator and a backing file NEED_PMA = pma +# List of tests that need --csv +NEED_CSV = csv1 + # Lists of tests that run a shell script RUN_SHELL = exit fflush localenl modifiers next randtest rtlen rtlen01 @@ -4484,7 +4487,7 @@ crlf: csv1: @echo $@ - @-AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ + @-AWKPATH="$(srcdir)" $(AWK) -f $@.awk --csv < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@ dbugeval2: diff --git a/test/Maketests b/test/Maketests index 628ff3fa..d284aab6 100644 --- a/test/Maketests +++ b/test/Maketests @@ -1414,7 +1414,7 @@ crlf: csv1: @echo $@ - @-AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ + @-AWKPATH="$(srcdir)" $(AWK) -f $@.awk --csv < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@ dbugeval2: diff --git a/test/badargs.ok b/test/badargs.ok index d2c67cac..1d79bc78 100644 --- a/test/badargs.ok +++ b/test/badargs.ok @@ -17,6 +17,7 @@ Short options: GNU long options: (extensions) -h --help -i includefile --include=includefile -I --trace + -k --csv -l library --load=library -L[fatal|invalid|no-ext] --lint[=fatal|invalid|no-ext] -M --bignum diff --git a/test/csv1.awk b/test/csv1.awk index 12bbf1e5..4896ef7c 100644 --- a/test/csv1.awk +++ b/test/csv1.awk @@ -1,6 +1,6 @@ -BEGIN { - FS = "," -} +# BEGIN { +# FS = "," +# } { printf(" \t%s\t", $0) |