summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Arnold D. Robbins <arnold@skeeve.com> 2010-07-16 12:04:45 +0300
committer Arnold D. Robbins <arnold@skeeve.com> 2010-07-16 12:04:45 +0300
commit dbd583bd2b8a6dd40c622875a4e197360cb5aba7 (patch)
tree d9fb7b6595cb44fefb4e32d70af9ac6d057af14a
parent b8c608200919aa3f7b3fef289a7bece2d2961412 (diff)
download gawk-dbd583bd2b8a6dd40c622875a4e197360cb5aba7.tar.gz
Move to 2.13.3 (from 2.13.tar.gz - sigh). gawk-2.13.3
-rw-r--r--CHANGES58
-rw-r--r--COPYING355
-rw-r--r--FUTURES6
-rw-r--r--LIMITATIONS10
-rw-r--r--Makefile-dist255
-rw-r--r--Makefile.in (renamed from Makefile)27
-rw-r--r--PROBLEMS4
-rw-r--r--README18
-rw-r--r--array.c6
-rw-r--r--atari/Makefile.st6
-rw-r--r--awk.h9
-rw-r--r--awk.tab.c88
-rw-r--r--awk.y54
-rw-r--r--builtin.c12
-rw-r--r--config.h287
-rw-r--r--config.h.in (renamed from config.h-dist)0
-rwxr-xr-xconfigure8
-rw-r--r--eval.c26
-rw-r--r--field.c100
-rw-r--r--gawk.texinfo3464
-rw-r--r--io.c119
-rw-r--r--iop.c236
-rw-r--r--main.c50
-rw-r--r--missing/dup2.c12
-rw-r--r--missing/gcvt.c9
-rw-r--r--missing/getopt.c6
-rw-r--r--missing/memcmp.c4
-rw-r--r--missing/random.c16
-rw-r--r--missing/strcase.c10
-rw-r--r--missing/strerror.c13
-rw-r--r--missing/strftime.3259
-rw-r--r--missing/strftime.c384
-rw-r--r--missing/strtod.c5
-rw-r--r--missing/strtol.c120
-rw-r--r--missing/system.c7
-rw-r--r--missing/tmpnam.c27
-rw-r--r--missing/tzset.c38
-rw-r--r--missing/vprintf.c9
-rwxr-xr-xmkconf8
-rw-r--r--msg.c36
-rw-r--r--node.c12
-rw-r--r--patchlevel.h2
-rw-r--r--pc/make.bat6
-rw-r--r--protos.h7
-rw-r--r--re.c11
-rw-r--r--regex.c5135
-rw-r--r--regex.h666
-rw-r--r--support/makeinfo.patch233
-rw-r--r--support/texindex.c190
-rw-r--r--support/texinfo.tex1182
-rw-r--r--vms/gawk.hlp116
-rw-r--r--vms/vms.h15
-rw-r--r--vms/vms_args.c40
-rw-r--r--vms/vms_fwrite.c6
-rw-r--r--vms/vms_gawk.c6
-rw-r--r--vms/vms_misc.c6
-rw-r--r--vms/vms_popen.c199
57 files changed, 10309 insertions, 3684 deletions
diff --git a/CHANGES b/CHANGES
index 0d7396ca..fa862323 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,3 +1,61 @@
+Changes from 2.13.2 to 2.13.3
+-----------------------------
+
+Updated manual!
+
+Error messages now conform to GNU standard (I hope).
+
+The length of a string was a short and now is a size_t.
+
+Updated VMS help.
+
+Added a few new tests to the test suite.
+
+Regexp can (once again) match a newline, if given explicitly.
+
+Fixed VMS pipe simulation.
+
+Fixed bug in getline without redirect from a file -- it was quitting after the
+ first EOF, rather than trying the next file.
+
+Fixed bug in treatment of backslash at the end of a string -- it was bombing
+ rather than doing something sensible. It is not clear what this should mean,
+ but for now I issue a warning and take it as a literal backslash.
+
+Moved setting of regexp syntax to before the option parsing in main(), to
+ handle things like -v FS='[.,;]'
+
+Fixed bug when NF is set by user -- fields_arr must be expanded if necessary.
+
+Fixed bug for [g]sub() where no match found and for zero-length string.
+
+make test does not assume that . is in PATH.
+
+Fixed bug for assignment to field beyond end of record -- the assigned value
+ was not found on subsequent reference to that field.
+
+Fixed bug for FS a regexp and it matches at the end of a record.
+
+Fixed memory leak for an array local to a function.
+
+Fixed hanging of pipe redirection to getline.
+
+Fixed coredump on access to $0 inside BEGIN block.
+
+Fixed treatment of RS = "". It now parses the fields correctly and strips
+ leading whitespace from a record if FS is a space.
+
+Fixed faking of /dev/stdin.
+
+Fixed problem with x += x
+
+Use of scalar as array and vice versa is now detected.
+
+Switch to GPL version 2.
+
+More "lint" checking.
+
+
Changes from 2.13.1 to 2.13.2
-----------------------------
diff --git a/COPYING b/COPYING
index 9a170375..3358a7be 100644
--- a/COPYING
+++ b/COPYING
@@ -1,38 +1,40 @@
-
GNU GENERAL PUBLIC LICENSE
- Version 1, February 1989
+ Version 2, June 1991
- Copyright (C) 1989 Free Software Foundation, Inc.
- 675 Mass Ave, Cambridge, MA 02139, USA
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+ 675 Mass Ave, Cambridge, MA 02139, USA
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
- The license agreements of most software companies try to keep users
-at the mercy of those companies. By contrast, our General Public
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
License is intended to guarantee your freedom to share and change free
-software--to make sure the software is free for all its users. The
-General Public License applies to the Free Software Foundation's
-software and to any other program whose authors commit to using it.
-You can use it for your programs, too.
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.) You can apply it to
+your programs, too.
When we speak of free software, we are referring to freedom, not
-price. Specifically, the General Public License is designed to make
-sure that you have the freedom to give away or sell copies of free
-software, that you receive source code or can get it if you want it,
-that you can change the software or use pieces of it in new free
-programs; and that you know you can do these things.
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
To protect your rights, we need to make restrictions that forbid
anyone to deny you these rights or to ask you to surrender the rights.
These restrictions translate to certain responsibilities for you if you
distribute copies of the software, or if you modify it.
- For example, if you distribute copies of a such a program, whether
+ For example, if you distribute copies of such a program, whether
gratis or for a fee, you must give the recipients all the rights that
you have. You must make sure that they, too, receive or can get the
-source code. And you must tell them their rights.
+source code. And you must show them these terms so they know their
+rights.
We protect your rights with two steps: (1) copyright the software, and
(2) offer you this license which gives you legal permission to copy,
@@ -45,120 +47,207 @@ want its recipients to know that what they have is not the original, so
that any problems introduced by others will not reflect on the original
authors' reputations.
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
The precise terms and conditions for copying, distribution and
modification follow.
GNU GENERAL PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
- 0. This License Agreement applies to any program or other work which
-contains a notice placed by the copyright holder saying it may be
-distributed under the terms of this General Public License. The
-"Program", below, refers to any such program or work, and a "work based
-on the Program" means either the Program or any work containing the
-Program or a portion of it, either verbatim or with modifications. Each
-licensee is addressed as "you".
-
- 1. You may copy and distribute verbatim copies of the Program's source
-code as you receive it, in any medium, provided that you conspicuously and
-appropriately publish on each copy an appropriate copyright notice and
-disclaimer of warranty; keep intact all the notices that refer to this
-General Public License and to the absence of any warranty; and give any
-other recipients of the Program a copy of this General Public License
-along with the Program. You may charge a fee for the physical act of
-transferring a copy.
-
- 2. You may modify your copy or copies of the Program or any portion of
-it, and copy and distribute such modifications under the terms of Paragraph
-1 above, provided that you also do the following:
-
- a) cause the modified files to carry prominent notices stating that
- you changed the files and the date of any change; and
-
- b) cause the whole of any work that you distribute or publish, that
- in whole or in part contains the Program or any part thereof, either
- with or without modifications, to be licensed at no charge to all
- third parties under the terms of this General Public License (except
- that you may choose to grant warranty protection to some or all
- third parties, at your option).
-
- c) If the modified program normally reads commands interactively when
- run, you must cause it, when started running for such interactive use
- in the simplest and most usual way, to print or display an
- announcement including an appropriate copyright notice and a notice
- that there is no warranty (or else, saying that you provide a
- warranty) and that users may redistribute the program under these
- conditions, and telling the user how to view a copy of this General
- Public License.
-
- d) You may charge a fee for the physical act of transferring a
- copy, and you may at your option offer warranty protection in
- exchange for a fee.
-
-Mere aggregation of another independent work with the Program (or its
-derivative) on a volume of a storage or distribution medium does not bring
-the other work under the scope of these terms.
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
- 3. You may copy and distribute the Program (or a portion or derivative of
-it, under Paragraph 2) in object code or executable form under the terms of
-Paragraphs 1 and 2 above provided that you also do one of the following:
-
- a) accompany it with the complete corresponding machine-readable
- source code, which must be distributed under the terms of
- Paragraphs 1 and 2 above; or,
-
- b) accompany it with a written offer, valid for at least three
- years, to give any third party free (except for a nominal charge
- for the cost of distribution) a complete machine-readable copy of the
- corresponding source code, to be distributed under the terms of
- Paragraphs 1 and 2 above; or,
-
- c) accompany it with the information you received as to where the
- corresponding source code may be obtained. (This alternative is
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
allowed only for noncommercial distribution and only if you
- received the program in object code or executable form alone.)
-
-Source code for a work means the preferred form of the work for making
-modifications to it. For an executable file, complete source code means
-all the source code for all modules it contains; but, as a special
-exception, it need not include source code for modules which are standard
-libraries that accompany the operating system on which the executable
-file runs, or for standard header files or definitions files that
-accompany that operating system.
-
- 4. You may not copy, modify, sublicense, distribute or transfer the
-Program except as expressly provided under this General Public License.
-Any attempt otherwise to copy, modify, sublicense, distribute or transfer
-the Program is void, and will automatically terminate your rights to use
-the Program under this License. However, parties who have received
-copies, or rights to use copies, from you under this General Public
-License will not have their licenses terminated so long as such parties
-remain in full compliance.
-
- 5. By copying, distributing or modifying the Program (or any work based
-on the Program) you indicate your acceptance of this license to do so,
-and all its terms and conditions.
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
6. Each time you redistribute the Program (or any work based on the
-Program), the recipient automatically receives a license from the original
-licensor to copy, distribute or modify the Program subject to these
-terms and conditions. You may not impose any further restrictions on the
-recipients' exercise of the rights granted herein.
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
- 7. The Free Software Foundation may publish revised and/or new versions
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
of the General Public License from time to time. Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the Program
-specifies a version number of the license which applies to it and "any
+specifies a version number of this License which applies to it and "any
later version", you have the option of following the terms and conditions
either of that version or of any later version published by the Free
Software Foundation. If the Program does not specify a version number of
-the license, you may choose any version ever published by the Free Software
+this License, you may choose any version ever published by the Free Software
Foundation.
- 8. If you wish to incorporate parts of the Program into other free
+ 10. If you wish to incorporate parts of the Program into other free
programs whose distribution conditions are different, write to the author
to ask for permission. For software which is copyrighted by the Free
Software Foundation, write to the Free Software Foundation; we sometimes
@@ -168,7 +257,7 @@ of promoting the sharing and reuse of software generally.
NO WARRANTY
- 9. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
@@ -178,7 +267,7 @@ TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
REPAIR OR CORRECTION.
- 10. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
@@ -193,22 +282,21 @@ POSSIBILITY OF SUCH DAMAGES.
Appendix: How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
-possible use to humanity, the best way to achieve this is to make it
-free software which everyone can redistribute and change under these
-terms.
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
- To do so, attach the following notices to the program. It is safest to
-attach them to the start of each source file to most effectively convey
-the exclusion of warranty; and each file should have at least the
-"copyright" line and a pointer to where the full notice is found.
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
<one line to give the program's name and a brief idea of what it does.>
Copyright (C) 19yy <name of author>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 1, or (at your option)
- any later version.
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -224,26 +312,29 @@ Also add information on how to contact you by electronic and paper mail.
If the program is interactive, make it output a short notice like this
when it starts in an interactive mode:
- Gnomovision version 69, Copyright (C) 19xx name of author
+ Gnomovision version 69, Copyright (C) 19yy name of author
Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.
-The hypothetical commands `show w' and `show c' should show the
-appropriate parts of the General Public License. Of course, the
-commands you use may be called something other than `show w' and `show
-c'; they could even be mouse-clicks or menu items--whatever suits your
-program.
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
You should also get your employer (if you work as a programmer) or your
school, if any, to sign a "copyright disclaimer" for the program, if
-necessary. Here a sample; alter the names:
+necessary. Here is a sample; alter the names:
- Yoyodyne, Inc., hereby disclaims all copyright interest in the
- program `Gnomovision' (a program to direct compilers to make passes
- at assemblers) written by James Hacker.
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
<signature of Ty Coon>, 1 April 1989
Ty Coon, President of Vice
-That's all there is to it!
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Library General
+Public License instead of this License.
+
diff --git a/FUTURES b/FUTURES
index 9f906ff2..1c63dcd7 100644
--- a/FUTURES
+++ b/FUTURES
@@ -32,7 +32,7 @@ Update regex.h flags for AWK and feed back to FSF
Feedback alloca.s changes to FSF
-Brief manual clean up
+10/28/91: DONE: Brief manual clean up
4/18/91: DONE: Provide a list of systems
a) where gawk has been ported
@@ -51,7 +51,7 @@ Extensible hashing and on-disk storage of awk arrays
Add |&
-Warn or fatal if identifier used as both variable and array
+10/28/91: DONE: Warn or fatal if identifier used as both variable and array
Split() with null string as third arg to split up strings
@@ -61,7 +61,7 @@ RECLEN variable for fixed length records
Make awk '/foo/' files... run like egrep
-Extensive manual cleanup:
+10/28/91: DONE: Extensive manual cleanup:
Use of texinfo 2.0 features
diff --git a/LIMITATIONS b/LIMITATIONS
index 8184c87a..5877197a 100644
--- a/LIMITATIONS
+++ b/LIMITATIONS
@@ -1,14 +1,12 @@
This file describes limits of gawk on a Unix system (although it
is variable even then). Non-Unix systems may have other limits.
-All cases below where MAX_SHORT is specified will be increased
-to MAX_INT in a future release.
# of fields in a record: MAX_INT
-Length of input record: MAX_SHORT
+Length of input record: MAX_INT
Length of output record: unlimited
-Size of a field: MAX_SHORT
-Size of a printf string: MAX_SHORT
-Size of a literal string: MAX_SHORT
+Size of a field: MAX_INT
+Size of a printf string: MAX_INT
+Size of a literal string: MAX_INT
Characters in a character class: 2^(# of bits per byte)
# of file redirections: unlimited
# of pipe redirections: min(# of processes per user, # of open files)
diff --git a/Makefile-dist b/Makefile-dist
deleted file mode 100644
index 2a7d6ab6..00000000
--- a/Makefile-dist
+++ /dev/null
@@ -1,255 +0,0 @@
-# Makefile for GNU Awk.
-#
-# Copyright (C) 1986, 1988, 1989 the Free Software Foundation, Inc.
-#
-# This file is part of GAWK, the GNU implementation of the
-# AWK Progamming Language.
-#
-# GAWK is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 1, or (at your option)
-# any later version.
-#
-# GAWK is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with GAWK; see the file COPYING. If not, write to
-# the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
-
-# User tunable macros -- CHANGE THESE IN Makefile-dist RATHER THAN IN
-# Makefile, OR configure WILL OVERWRITE YOUR CHANGES
-
-DESTDIR=
-BINDIR= /usr/local/bin
-MANDIR= /usr/man/manl
-MANEXT= l
-
-# CFLAGS: options to the C compiler
-#
-# -O optimize
-# -g include dbx/sdb info
-# -pg include new (gmon) profiling info
-#
-# The provided "configure" is used to turn a config file (samples in
-# the "config" directory into commands to edit config.h-dist into
-# a suitable config.h and to edit Makefile-dist into Makefile.
-# To port GAWK, create an appropriate config file using the ones in
-# the config directory as examples and using the comments in config.h-dist
-# as a guide.
-#
-
-CC= cc
-
-OPTIMIZE= -g #-O -fstrength-reduce
-PROFILE= #-pg
-DEBUG= #-DMALLOCDEBUG #-DMEMDEBUG #-DDEBUG #-DFUNC_TRACE #-DMPROF
-LINKSTATIC= #-Bstatic
-WARN= #-W -Wunused -Wimplicit -Wreturn-type -Wcomment # for gcc only
-
-# Parser to use on grammar - any one of the following will work
-PARSER = yacc
-#PARSER = byacc
-#PARSER = bison -y
-
-# Set LIBS to any libraries that are machine specific
-LIBS =
-
-# Cray 2 running Unicos 5.0.7
-##MAKE_LIBNET## LIBS = -lnet
-
-##MAKE_NeXT## FLAGS = -DGFMT_WORKAROUND
-
-# Systems with alloca in /lib/libPW.a
-##MAKE_ALLOCA_PW## LIBS = -lPW
-
-# ALLOCA - only needed if you use bison
-# Set equal to alloca.o if your system is S5 and you don't have
-# alloca. Uncomment one of the rules below to make alloca.o from
-# either alloca.s or alloca.c.
-# This should have already been done automatically by configure.
-#
-# Some systems have alloca in libPW.a, so LIBS=-lPW may work, too.
-##MAKE_ALLOCA_C## ALLOCA= alloca.o
-##MAKE_ALLOCA_S## ALLOCA= alloca.o
-
-FLAGS=
-##MAKE_RS6000## FLAGS = -qchars=signed
-
-# HP/Apollo running cc version 6.7 or earlier
-##MAKE_Apollo## FLAGS = -U__STDC__ -A run,sys5.3
-##MAKE_Apollo## LIBS = -A sys,any
-
-CFLAGS= $(FLAGS) $(DEBUG) $(LINKSTATIC) $(PROFILE) $(OPTIMIZE) $(WARN)
-
-# object files
-AWKOBJS = main.o eval.o builtin.o msg.o iop.o io.o field.o array.o \
- node.o version.o missing.o re.o
-
-ALLOBJS = $(AWKOBJS) awk.tab.o
-
-# GNUOBJS
-# GNU stuff that gawk uses as library routines.
-GNUOBJS= regex.o dfa.o $(ALLOCA)
-
-# source and documentation files
-SRC = main.c eval.c builtin.c msg.c version.c \
- iop.c io.c field.c array.c node.c missing.c re.c
-
-ALLSRC= $(SRC) awk.tab.c
-
-AWKSRC= awk.h awk.y $(ALLSRC) patchlevel.h protos.h config.h-dist
-
-GNUSRC = alloca.c alloca.s dfa.c dfa.h regex.c regex.h
-
-COPIES = missing/getopt.c missing/system.c missing/tzset.c \
- missing/memcmp.c missing/memcpy.c missing/memset.c \
- missing/random.c missing/strcase.c missing/strchr.c \
- missing/strerror.c missing/strtod.c missing/vprintf.c \
- missing/strftime.c missing/strftime.3 missing/strtol.c
-
-SUPPORT = support/texindex.c support/texinfo.tex
-
-DOCS= gawk.1 gawk.texinfo
-
-INFOFILES= gawk-info gawk-info-1 gawk-info-2 gawk-info-3 gawk-info-4 \
- gawk-info-5 gawk-info-6 gawk.aux gawk.cp gawk.cps gawk.fn \
- gawk.fns gawk.ky gawk.kys gawk.pg gawk.pgs gawk.toc \
- gawk.tp gawk.tps gawk.vr gawk.vrs
-
-MISC = CHANGES COPYING FUTURES Makefile-dist PROBLEMS README* PORTS mkconf \
- mungeconf configure ACKNOWLEDGMENT LIMITATIONS
-
-OTHERS= pc/* atari/* vms/*
-
-ALLDOC= gawk.dvi $(INFOFILES)
-
-ALLFILES= $(AWKSRC) $(GNUSRC) $(COPIES) $(MISC) $(DOCS) $(ALLDOC) $(OTHERS) \
- $(SUPPORT)
-
-# Release of gawk. There can be no leading or trailing white space here!
-REL=2.13
-
-# rules to build gawk
-gawk: $(ALLOBJS) $(GNUOBJS) $(REOBJS)
- $(CC) -o gawk $(CFLAGS) $(ALLOBJS) $(GNUOBJS) $(REOBJS) -lm $(LIBS)
-
-$(AWKOBJS): awk.h config.h
-
-dfa.o: awk.h config.h dfa.h
-
-regex.o: awk.h config.h regex.h
-
-main.o: patchlevel.h
-
-awk.tab.o: awk.h awk.tab.c
-
-awk.tab.c: awk.y
- $(PARSER) -v awk.y
- sed '/^extern char .malloc(), .realloc();$$/d' y.tab.c >awk.tab.c
- rm y.tab.c
-
-config.h: config.h-dist
- @echo You must provide a config.h!
- @echo Run \"./configure\" to build it for known systems
- @echo or copy config.h-dist to config.h and edit it.; exit 1
-
-install: gawk
- install -s gawk $(DESTDIR)$(BINDIR)
- install -c gawk.1 $(DESTDIR)$(MANDIR)/gawk.$(MANEXT)
-
-# ALLOCA: uncomment this if your system (notably System V boxen)
-# does not have alloca in /lib/libc.a or /lib/libPW.a
-#
-# If your machine is not supported by the assembly version of alloca.s,
-# use the C version which follows instead. It uses the default rules to
-# make alloca.o.
-#
-# One of these rules should have already been selected by running configure.
-
-
-##MAKE_ALLOCA_S## alloca.o: alloca.s
-##MAKE_ALLOCA_S## /lib/cpp < alloca.s | sed '/^#/d' > t.s
-##MAKE_ALLOCA_S## as t.s -o alloca.o
-##MAKE_ALLOCA_S## rm t.s
-
-##MAKE_ALLOCA_C## alloca.o: alloca.c
-
-# auxiliary rules for release maintenance
-lint: $(ALLSRC)
- lint -hcbax $(FLAGS) $(ALLSRC)
-
-xref:
- cxref -c $(FLAGS) $(ALLSRC) | grep -v ' /' >xref
-
-clean:
- rm -f *.o core awk.output gmon.out make.out y.output
-
-cleaner: clean
- rm -f gawk awk.tab.c
-
-clobber: clean
- rm -f $(ALLDOC) gawk.log
-
-gawk.dvi: gawk.texinfo
- tex gawk.texinfo ; texindex gawk.??
- tex gawk.texinfo ; texindex gawk.??
- tex gawk.texinfo
-
-$(INFOFILES): gawk.texinfo
- makeinfo gawk.texinfo
-
-gawk-test-$(REL).tar.Z::
- -rm -f gawk-test-$(REL).tar.Z
- tar -cf - test | compress >gawk-test-$(REL).tar.Z
-
-dist: $(AWKSRC) $(GNUSRC) $(DOCS) $(MISC) $(COPIES) $(SUPPORT)
- configure msdos
- mv config.h pc
- -rm -rf gawk-$(REL) gawk-$(REL).*.tar.Z
- -mkdir gawk-$(REL)
- cp -p $(AWKSRC) $(GNUSRC) $(DOCS) $(MISC) gawk-$(REL)
- -mkdir gawk-$(REL)/missing
- cp -p $(COPIES) gawk-$(REL)/missing
- -mkdir gawk-$(REL)/atari
- cp -p atari/* gawk-$(REL)/atari
- -mkdir gawk-$(REL)/pc
- cp -p pc/* gawk-$(REL)/pc
- -mkdir gawk-$(REL)/vms
- cp -p vms/* gawk-$(REL)/vms
- -mkdir gawk-$(REL)/config
- cp -p config/* gawk-$(REL)/config
- -mkdir gawk-$(REL)/support
- cp -p support/* gawk-$(REL)/support
- ln -s ../test gawk-$(REL)
- tar -cfh - gawk-$(REL) | compress > gawk-$(REL).`gawk '{print $$3}' patchlevel.h`.tar.Z
-
-gawk-doc-$(REL).tar.Z: $(ALLDOC)
- -rm -rf gawk-doc-$(REL) gawk-doc-$(REL).tar.Z
- -mkdir gawk-doc-$(REL)
- cp -p $(INFOFILES) gawk.dvi gawk-doc-$(REL)
- nroff -man gawk.1 > gawk-doc-$(REL)/gawk.1.pr
- tar -cf - gawk-$(REL)-doc | compress > gawk-doc-$(REL).tar.Z
-
-gawk-ps-$(REL).tar.Z: gawk-ps-$(REL).tar.Z
- -rm -rf gawk-ps-$(REL) gawk-ps-$(REL).tar.Z
- -mkdir gawk-ps-$(REL)
- dvips -o !cat gawk.dvi > gawk-ps-$(REL)/gawk.postscript
- pstroff -man gawk.1 > gawk-ps-$(REL)/gawk.1.ps
- tar -cf - gawk-ps-$(REL) | compress > gawk-ps-$(REL).tar.Z
-
-release: gawk-src-$(REL).tar.Z gawk-doc-$(REL).tar.Z gawk-ps-$(REL).tar.Z \
- gawk-test-$(REL).tar.Z
-
-diff:
- for i in RCS/*; do rcsdiff -c -b $$i > `basename $$i ,v`.diff; done
-
-test::
- make gawk
- cd test; make -k
-
-bigtest::
- make gawk
- cd test; make -k bigtest
diff --git a/Makefile b/Makefile.in
index 2a7d6ab6..19f008ec 100644
--- a/Makefile
+++ b/Makefile.in
@@ -1,14 +1,14 @@
# Makefile for GNU Awk.
#
-# Copyright (C) 1986, 1988, 1989 the Free Software Foundation, Inc.
+# Copyright (C) 1986, 1988-1991 the Free Software Foundation, Inc.
#
# This file is part of GAWK, the GNU implementation of the
# AWK Progamming Language.
#
# GAWK is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 1, or (at your option)
-# any later version.
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
#
# GAWK is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -17,9 +17,9 @@
#
# You should have received a copy of the GNU General Public License
# along with GAWK; see the file COPYING. If not, write to
-# the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-# User tunable macros -- CHANGE THESE IN Makefile-dist RATHER THAN IN
+# User tunable macros -- CHANGE THESE IN Makefile.in RATHER THAN IN
# Makefile, OR configure WILL OVERWRITE YOUR CHANGES
DESTDIR=
@@ -34,10 +34,10 @@ MANEXT= l
# -pg include new (gmon) profiling info
#
# The provided "configure" is used to turn a config file (samples in
-# the "config" directory into commands to edit config.h-dist into
-# a suitable config.h and to edit Makefile-dist into Makefile.
+# the "config" directory into commands to edit config.h.in into
+# a suitable config.h and to edit Makefile.in into Makefile.
# To port GAWK, create an appropriate config file using the ones in
-# the config directory as examples and using the comments in config.h-dist
+# the config directory as examples and using the comments in config.h.in
# as a guide.
#
@@ -100,7 +100,7 @@ SRC = main.c eval.c builtin.c msg.c version.c \
ALLSRC= $(SRC) awk.tab.c
-AWKSRC= awk.h awk.y $(ALLSRC) patchlevel.h protos.h config.h-dist
+AWKSRC= awk.h awk.y $(ALLSRC) patchlevel.h protos.h config.h.in
GNUSRC = alloca.c alloca.s dfa.c dfa.h regex.c regex.h
@@ -119,7 +119,7 @@ INFOFILES= gawk-info gawk-info-1 gawk-info-2 gawk-info-3 gawk-info-4 \
gawk.fns gawk.ky gawk.kys gawk.pg gawk.pgs gawk.toc \
gawk.tp gawk.tps gawk.vr gawk.vrs
-MISC = CHANGES COPYING FUTURES Makefile-dist PROBLEMS README* PORTS mkconf \
+MISC = CHANGES COPYING FUTURES Makefile.in PROBLEMS README* PORTS mkconf \
mungeconf configure ACKNOWLEDGMENT LIMITATIONS
OTHERS= pc/* atari/* vms/*
@@ -140,7 +140,8 @@ $(AWKOBJS): awk.h config.h
dfa.o: awk.h config.h dfa.h
-regex.o: awk.h config.h regex.h
+regex.o: config.h regex.h
+ $(CC) $(CFLAGS) -DREGEX_MALLOC -DGAWK -c regex.c
main.o: patchlevel.h
@@ -151,10 +152,10 @@ awk.tab.c: awk.y
sed '/^extern char .malloc(), .realloc();$$/d' y.tab.c >awk.tab.c
rm y.tab.c
-config.h: config.h-dist
+config.h: config.h.in
@echo You must provide a config.h!
@echo Run \"./configure\" to build it for known systems
- @echo or copy config.h-dist to config.h and edit it.; exit 1
+ @echo or copy config.h.in to config.h and edit it.; exit 1
install: gawk
install -s gawk $(DESTDIR)$(BINDIR)
diff --git a/PROBLEMS b/PROBLEMS
index f36aa501..de1cf33f 100644
--- a/PROBLEMS
+++ b/PROBLEMS
@@ -3,6 +3,4 @@ Hopefully they will all be fixed in the next major release of gawk.
Please keep in mind that the code is still undergoing significant evolution.
-1. Gawk's error messages are not in GNU standard format (not emacs parseable).
-
-2. Gawk's printf is probably still not POSIX compliant.
+1. Gawk's printf is probably still not POSIX compliant.
diff --git a/README b/README
index c3bfc0dd..8277acc2 100644
--- a/README
+++ b/README
@@ -1,6 +1,3 @@
-**** NOTE: The texinfo manual is being edited and will be restored
-**** to the distribution shortly after the release.
-
README:
This is GNU Awk 2.13. It should be upwardly compatible with the
@@ -15,10 +12,13 @@ Known problems are given in the PROBLEMS file. Work to be done is
described briefly in the FUTURES file. Verified ports are listed in
the PORTS file. Please read the LIMITATIONS and ACKNOWLEDGMENT files.
-The gawk.texinfo included in this release is out of date with respect to
-the code. An updated manual will be released soon after the code.
-It can be changed into an info file (included) with
-makeinfo or with texinfo-format-buffer in emacs without changes.
+To format the documentation with TeX, you must use texinfo.tex 2.53
+or later. Otherwise footnotes look unacceptable.
+
+If you wish to remake the Info files, you should use makeinfo. We
+used makeinfo 2.10. Note that this version of makeinfo requires a patch,
+which is supplied in support/makeinfo.patch. We don't know if this patch
+will make it into the next release of makeinfo or not.
The man page is up to date.
If you do not have nroff or troff, you can use `awf' included in
@@ -26,7 +26,7 @@ the test suite to format the manual page with only a few small problems.
Summary of Changes from 2.11.1
-Configuration is via a config file which is used by the "mkconf" script
+Configuration is via a config file which is used by the "configure" script
to create Makefile and config.h. Sample configuration files for various systems
are included in the config directory.
@@ -93,7 +93,7 @@ with 'make bigtest'. There are many interesting programs in the test suite!
PRINTING THE MANUAL
-The 'support' directory contains texinfo.tex 2.40, which will be necessary
+The 'support' directory contains texinfo.tex 2.53, which will be necessary
for printing the manual, and the texindex.c program from the emacs distribution
which is also necessary. See the makefile for the steps needed to get a
DVI file from the manual.
diff --git a/array.c b/array.c
index c6b1a0d6..de965ff6 100644
--- a/array.c
+++ b/array.c
@@ -10,8 +10,8 @@
*
* GAWK is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 1, or (at your option)
- * any later version.
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
*
* GAWK is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -20,7 +20,7 @@
*
* You should have received a copy of the GNU General Public License
* along with GAWK; see the file COPYING. If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include "awk.h"
diff --git a/atari/Makefile.st b/atari/Makefile.st
index 36bf0ae3..b1ca4988 100644
--- a/atari/Makefile.st
+++ b/atari/Makefile.st
@@ -18,8 +18,8 @@
#
# GAWK is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 1, or (at your option)
-# any later version.
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
#
# GAWK is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -28,7 +28,7 @@
#
# You should have received a copy of the GNU General Public License
# along with GAWK; see the file COPYING. If not, write to
-# the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# User tunable macros
diff --git a/awk.h b/awk.h
index 778e4765..08cc42ab 100644
--- a/awk.h
+++ b/awk.h
@@ -10,8 +10,8 @@
*
* GAWK is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 1, or (at your option)
- * any later version.
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
*
* GAWK is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -20,7 +20,7 @@
*
* You should have received a copy of the GNU General Public License
* along with GAWK; see the file COPYING. If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/* ------------------------------ Includes ------------------------------ */
@@ -325,7 +325,7 @@ typedef struct exp_node {
* the structure on many machines
*/
char *sp;
- short slen;
+ size_t slen;
unsigned char sref;
char idx;
} val;
@@ -424,7 +424,6 @@ typedef struct iobuf {
char *end;
size_t size; /* this will be determined by an fstat() call */
int cnt;
- char *secbuf;
size_t secsiz;
int flag;
# define IOP_IS_TTY 1
diff --git a/awk.tab.c b/awk.tab.c
index c61d2553..f19b16ca 100644
--- a/awk.tab.c
+++ b/awk.tab.c
@@ -103,7 +103,7 @@ extern int yyerrflag;
YYSTYPE yylval, yyval;
# define YYERRCODE 256
-# line 709 "awk.y"
+# line 717 "awk.y"
struct token {
@@ -182,44 +182,46 @@ va_dcl
{
va_list args;
char *mesg;
- register char *ptr, *beg;
+ register char *bp, *cp;
char *scan;
+ char buf[120];
errcount++;
/* Find the current line in the input file */
if (lexptr) {
if (!thisline) {
- for (beg = lexeme; beg != lexptr_begin && *beg != '\n'; --beg)
+ for (cp=lexeme; cp != lexptr_begin && *cp != '\n'; --cp)
;
- if (*beg == '\n')
- beg++;
- thisline = beg;
+ if (*cp == '\n')
+ cp++;
+ thisline = cp;
}
/* NL isn't guaranteed */
- ptr = lexeme;
- while (ptr < lexend && *ptr && *ptr != '\n')
- ptr++;
+ bp = lexeme;
+ while (bp < lexend && *bp && *bp != '\n')
+ bp++;
} else {
thisline = "(END OF FILE)";
- ptr = thisline + 13;
+ bp = thisline + 13;
}
- msg("syntax error");
- fprintf(stderr, "%.*s\n", (int) (ptr - thisline), thisline);
+ msg("%.*s", (int) (bp - thisline), thisline);
+ bp = buf;
+ cp = buf + sizeof(buf) - 24; /* 24 more than longest msg. input */
if (lexptr) {
scan = thisline;
- while (scan < lexeme)
+ while (bp < cp && scan < lexeme)
if (*scan++ == '\t')
- putc('\t', stderr);
+ *bp++ = '\t';
else
- putc(' ', stderr);
- putc('^', stderr);
- putc(' ', stderr);
+ *bp++ = ' ';
+ *bp++ = '^';
+ *bp++ = ' ';
}
va_start(args);
mesg = va_arg(args, char *);
- vfprintf(stderr, mesg, args);
+ (void) vsprintf(bp, mesg, args);
va_end(args);
- putc('\n', stderr);
+ msg(buf);
exit(2);
}
@@ -2726,26 +2728,34 @@ case 133:
{ yyval.nodeval = snode (yypvt[-1].nodeval, Node_builtin, (int) yypvt[-3].lval); } break;
case 134:
# line 628 "awk.y"
-{ yyval.nodeval = snode ((NODE *)NULL, Node_builtin, (int) yypvt[-0].lval); } break;
+{
+ if (do_lint)
+ warning("call of length without parentheses is not portable");
+ yyval.nodeval = snode ((NODE *)NULL, Node_builtin, (int) yypvt[-0].lval);
+ if (do_posix) {
+ yyerror("POSIX requires parentheses for call to `length'");
+ yyerrok;
+ }
+ } break;
case 135:
-# line 630 "awk.y"
+# line 638 "awk.y"
{
yyval.nodeval = node (yypvt[-1].nodeval, Node_func_call, make_string(yypvt[-3].sval, strlen(yypvt[-3].sval)));
} break;
case 136:
-# line 634 "awk.y"
+# line 642 "awk.y"
{ yyval.nodeval = node (yypvt[-0].nodeval, Node_preincrement, (NODE *)NULL); } break;
case 137:
-# line 636 "awk.y"
+# line 644 "awk.y"
{ yyval.nodeval = node (yypvt[-0].nodeval, Node_predecrement, (NODE *)NULL); } break;
case 138:
-# line 638 "awk.y"
+# line 646 "awk.y"
{ yyval.nodeval = yypvt[-0].nodeval; } break;
case 139:
-# line 640 "awk.y"
+# line 648 "awk.y"
{ yyval.nodeval = yypvt[-0].nodeval; } break;
case 140:
-# line 643 "awk.y"
+# line 651 "awk.y"
{ if (yypvt[-0].nodeval->type == Node_val) {
yypvt[-0].nodeval->numbr = -(force_number(yypvt[-0].nodeval));
yyval.nodeval = yypvt[-0].nodeval;
@@ -2753,25 +2763,25 @@ case 140:
yyval.nodeval = node (yypvt[-0].nodeval, Node_unary_minus, (NODE *)NULL);
} break;
case 141:
-# line 650 "awk.y"
+# line 658 "awk.y"
{ yyval.nodeval = yypvt[-0].nodeval; } break;
case 142:
-# line 655 "awk.y"
+# line 663 "awk.y"
{ yyval.nodeval = node (yypvt[-1].nodeval, Node_postincrement, (NODE *)NULL); } break;
case 143:
-# line 657 "awk.y"
+# line 665 "awk.y"
{ yyval.nodeval = node (yypvt[-1].nodeval, Node_postdecrement, (NODE *)NULL); } break;
case 145:
-# line 663 "awk.y"
+# line 671 "awk.y"
{ yyval.nodeval = NULL; } break;
case 146:
-# line 665 "awk.y"
+# line 673 "awk.y"
{ yyval.nodeval = yypvt[-0].nodeval; } break;
case 147:
-# line 670 "awk.y"
+# line 678 "awk.y"
{ yyval.nodeval = variable(yypvt[-0].sval,1); } break;
case 148:
-# line 672 "awk.y"
+# line 680 "awk.y"
{
if (yypvt[-1].nodeval->rnode == NULL) {
yyval.nodeval = node (variable(yypvt[-3].sval,1), Node_subscript, yypvt[-1].nodeval->lnode);
@@ -2780,22 +2790,22 @@ case 148:
yyval.nodeval = node (variable(yypvt[-3].sval,1), Node_subscript, yypvt[-1].nodeval);
} break;
case 149:
-# line 680 "awk.y"
+# line 688 "awk.y"
{ yyval.nodeval = node (yypvt[-0].nodeval, Node_field_spec, (NODE *)NULL); } break;
case 150:
-# line 682 "awk.y"
+# line 690 "awk.y"
{ yyval.nodeval = node (yypvt[-0].nodeval, Node_field_spec, (NODE *)NULL); } break;
case 152:
-# line 690 "awk.y"
+# line 698 "awk.y"
{ yyerrok; } break;
case 153:
-# line 694 "awk.y"
+# line 702 "awk.y"
{ yyerrok; } break;
case 156:
-# line 703 "awk.y"
+# line 711 "awk.y"
{ yyerrok; want_assign = 0; } break;
case 157:
-# line 706 "awk.y"
+# line 714 "awk.y"
{ yyerrok; } break;
}
goto yystack; /* reset registers in driver code */
diff --git a/awk.y b/awk.y
index 37fa96f5..d2d2dbc5 100644
--- a/awk.y
+++ b/awk.y
@@ -10,8 +10,8 @@
*
* GAWK is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 1, or (at your option)
- * any later version.
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
*
* GAWK is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -20,7 +20,7 @@
*
* You should have received a copy of the GNU General Public License
* along with GAWK; see the file COPYING. If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
%{
@@ -625,7 +625,15 @@ non_post_simp_exp
| LEX_LENGTH '(' opt_expression_list r_paren
{ $$ = snode ($3, Node_builtin, (int) $1); }
| LEX_LENGTH
- { $$ = snode ((NODE *)NULL, Node_builtin, (int) $1); }
+ {
+ if (do_lint)
+ warning("call of length without parentheses is not portable");
+ $$ = snode ((NODE *)NULL, Node_builtin, (int) $1);
+ if (do_posix) {
+ yyerror("POSIX requires parentheses for call to `length'");
+ yyerrok;
+ }
+ }
| FUNC_CALL '(' opt_expression_list r_paren
{
$$ = node ($3, Node_func_call, make_string($1, strlen($1)));
@@ -784,44 +792,46 @@ va_dcl
{
va_list args;
char *mesg;
- register char *ptr, *beg;
+ register char *bp, *cp;
char *scan;
+ char buf[120];
errcount++;
/* Find the current line in the input file */
if (lexptr) {
if (!thisline) {
- for (beg = lexeme; beg != lexptr_begin && *beg != '\n'; --beg)
+ for (cp=lexeme; cp != lexptr_begin && *cp != '\n'; --cp)
;
- if (*beg == '\n')
- beg++;
- thisline = beg;
+ if (*cp == '\n')
+ cp++;
+ thisline = cp;
}
/* NL isn't guaranteed */
- ptr = lexeme;
- while (ptr < lexend && *ptr && *ptr != '\n')
- ptr++;
+ bp = lexeme;
+ while (bp < lexend && *bp && *bp != '\n')
+ bp++;
} else {
thisline = "(END OF FILE)";
- ptr = thisline + 13;
+ bp = thisline + 13;
}
- msg("syntax error");
- fprintf(stderr, "%.*s\n", (int) (ptr - thisline), thisline);
+ msg("%.*s", (int) (bp - thisline), thisline);
+ bp = buf;
+ cp = buf + sizeof(buf) - 24; /* 24 more than longest msg. input */
if (lexptr) {
scan = thisline;
- while (scan < lexeme)
+ while (bp < cp && scan < lexeme)
if (*scan++ == '\t')
- putc('\t', stderr);
+ *bp++ = '\t';
else
- putc(' ', stderr);
- putc('^', stderr);
- putc(' ', stderr);
+ *bp++ = ' ';
+ *bp++ = '^';
+ *bp++ = ' ';
}
va_start(args);
mesg = va_arg(args, char *);
- vfprintf(stderr, mesg, args);
+ (void) vsprintf(bp, mesg, args);
va_end(args);
- putc('\n', stderr);
+ msg(buf);
exit(2);
}
diff --git a/builtin.c b/builtin.c
index 9465ba1f..c2e28e52 100644
--- a/builtin.c
+++ b/builtin.c
@@ -10,8 +10,8 @@
*
* GAWK is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 1, or (at your option)
- * any later version.
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
*
* GAWK is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -20,7 +20,7 @@
*
* You should have received a copy of the GNU General Public License
* along with GAWK; see the file COPYING. If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include "awk.h"
@@ -983,19 +983,17 @@ int global;
}
text = t->stptr;
textlen = t->stlen;
+ buflen = textlen + 2;
s = force_string(tree_eval(s));
repl = s->stptr;
replend = repl + s->stlen;
repllen = replend - repl;
if (repllen == 0) { /* replacement is null string */
- buflen = textlen;
buf = text; /* so do subs. in place */
inplace = 1;
- } else {
- buflen = textlen * 2; /* initial guess -- adjusted later */
+ } else
emalloc(buf, char *, buflen, "do_sub");
- }
ampersands = 0;
for (scan = repl; scan < replend; scan++) {
if (*scan == '&') {
diff --git a/config.h b/config.h
deleted file mode 100644
index 72406f83..00000000
--- a/config.h
+++ /dev/null
@@ -1,287 +0,0 @@
-/*
- * config.h -- configuration definitions for gawk.
- *
- * Sun running SunOS 4.1
- */
-
-/*
- * Copyright (C) 1991, the Free Software Foundation, Inc.
- *
- * This file is part of GAWK, the GNU implementation of the
- * AWK Progamming Language.
- *
- * GAWK is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 1, or (at your option)
- * any later version.
- *
- * GAWK is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with GAWK; see the file COPYING. If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-/*
- * This file isolates configuration dependencies for gnu awk.
- * You should know something about your system, perhaps by having
- * a manual handy, when you edit this file. You should copy config.h-dist
- * to config.h, and edit config.h. Do not modify config.h-dist, so that
- * it will be easy to apply any patches that may be distributed.
- *
- * The general idea is that systems conforming to the various standards
- * should need to do the least amount of changing. Definining the various
- * items in ths file usually means that your system is missing that
- * particular feature.
- *
- * The order of preference in standard conformance is ANSI C, POSIX,
- * and the SVID.
- *
- * If you have no clue as to what's going on with your system, try
- * compiling gawk without editing this file and see what shows up
- * missing in the link stage. From there, you can probably figure out
- * which defines to turn on.
- */
-
-/**************************/
-/* Miscellanious features */
-/**************************/
-
-/*
- * BLKSIZE_MISSING
- *
- * Check your /usr/include/sys/stat.h file. If the stat structure
- * does not have a member named st_blksize, define this. (This will
- * most likely be the case on most System V systems prior to V.4.)
- */
-/* #define BLKSIZE_MISSING 1 */
-
-/*
- * SIGTYPE
- *
- * The return type of the routines passed to the signal function.
- * Modern systems use `void', older systems use `int'.
- * If left undefined, it will default to void.
- */
-/* #define SIGTYPE int */
-
-/*
- * SIZE_T_MISSING
- *
- * If your system has no typedef for size_t, define this to get a default
- */
-/* #define SIZE_T_MISSING 1 */
-
-/*
- * CHAR_UNSIGNED
- *
- * If your machine uses unsigned characters (IBM RT and RS/6000 and others)
- * then define this for use in regex.c
- */
-/* #define CHAR_UNSIGNED 1 */
-
-/*
- * HAVE_UNDERSCORE_SETJMP
- *
- * Check in your /usr/include/setjmp.h file. If there are routines
- * there named _setjmp and _longjmp, then you should define this.
- * Typically only systems derived from Berkeley Unix have this.
- */
-#define HAVE_UNDERSCORE_SETJMP 1
-
-/***********************************************/
-/* Missing library subroutines or system calls */
-/***********************************************/
-
-/*
- * GETOPT_MISSING
- *
- * Define this if your library does not have the getopt(3) library
- * routine for parsing command line arguments.
- */
-/* #define GETOPT_MISSING 1 */
-
-/*
- * MEMCMP_MISSING
- * MEMCPY_MISSING
- * MEMSET_MISSING
- *
- * These three routines are for manipulating blocks of memory. Most
- * likely they will either all three be present or all three be missing,
- * so they're grouped together.
- */
-/* #define MEMCMP_MISSING 1 */
-/* #define MEMCPY_MISSING 1 */
-/* #define MEMSET_MISSING 1 */
-
-/*
- * RANDOM_MISSING
- *
- * Your system does not have the random(3) suite of random number
- * generating routines. These are different than the old rand(3)
- * routines!
- */
-/* #define RANDOM_MISSING 1 */
-
-/*
- * STRCASE_MISSING
- *
- * Your system does not have the strcasemp() and strncasecmp()
- * routines that originated in Berkeley Unix.
- */
-/* #define STRCASE_MISSING 1 */
-
-/*
- * STRCHR_MISSING
- *
- * Your system does not have the strchr() and strrchr() functions.
- */
-/* #define STRCHR_MISSING 1 */
-
-/*
- * STRERROR_MISSING
- *
- * Your system lacks the ANSI C strerror() routine for returning the
- * strings associated with errno values.
- */
-#define STRERROR_MISSING 1
-
-/*
- * STRTOD_MISSING
- *
- * Your system does not have the strtod() routine for converting
- * strings to double precision floating point values.
- */
-/* #define STRTOD_MISSING 1 */
-
-/*
- * STRTOL_MISSING
- *
- * Your system does not have the strtol() routine for converting
- * strings to long integers.
- */
-/* #define STRTOL_MISSING 1 */
-
-/*
- * STRFTIME_MISSING
- *
- * Your system lacks the ANSI C strftime() routine for formatting
- * broken down time values.
- */
-/* #define STRFTIME_MISSING 1 */
-
-/*
- * TZSET_MISSING
- *
- * If you have a 4.2 BSD vintage system, then the strftime() routine
- * supplied in the missing directory won't be enough, because it relies on the
- * tzset() routine from System V / Posix. Fortunately, there is an
- * emulation for tzset() too that should do the trick. If you don't
- * have tzset(), define this.
- */
-/* #define TZSET_MISSING 1 */
-
-/*
- * TZNAME_MISSING
- *
- * Some systems do not support the external variables tzname and daylight.
- * If this is the case *and* strftime() is missing, define this.
- */
-/* #define TZNAME_MISSING 1 */
-
-/*
- * STDC_HEADERS
- *
- * If your system does have ANSI compliant header files that
- * provide prototypes for library routines, then define this.
- */
-/* #define STDC_HEADERS 1 */
-
-/*
- * NO_TOKEN_PASTING
- *
- * If your compiler define's __STDC__ but does not support token
- * pasting (tok##tok), then define this.
- */
-/* #define NO_TOKEN_PASTING 1 */
-
-/*****************************************************************/
-/* Stuff related to the Standard I/O Library. */
-/*****************************************************************/
-/* Much of this is (still, unfortunately) black magic in nature. */
-/* You may have to use some or all of these together to get gawk */
-/* to work correctly. */
-/*****************************************************************/
-
-/*
- * NON_STD_SPRINTF
- *
- * Look in your /usr/include/stdio.h file. If the return type of the
- * sprintf() function is NOT `int', define this.
- */
-#define NON_STD_SPRINTF 1
-
-/*
- * VPRINTF_MISSING
- *
- * Define this if your system lacks vprintf() and the other routines
- * that go with it.
- */
-/* #define VPRINTF_MISSING 1 */
-
-/*
- * BSDSTDIO
- *
- * Define this if your standard i/o library is internally compatible
- * with the one shipped with Berkeley Unix systems (4.n, n <= 3-reno).
- * If you've defined VPRINTF_MISSING, you probably will need this too.
- */
-/* #define BSDSTDIO 1 */
-
-/*
- * DOPRNT_MISSING
- *
- * Define this if your standard i/o library does not have the _doprnt()
- * routine. This is used in an attempt to simulate the vfprintf()
- * routine.
- */
-/* #define DOPRNT_MISSING 1 */
-
-/*
- * Casts from size_t to int and back. These will become unnecessary
- * at some point in the future, but for now are required where the
- * two types are a different representation.
- */
-/* #define SZTC */
-/* #define INTC */
-
-/*
- * SYSTEM_MISSING
- *
- * Define this if your library does not provide a system function
- * or you are not entirely happy with it and would rather use
- * a provided replacement (atari only).
- */
-/* #define SYSTEM_MISSING 1 */
-
-
-/*******************************/
-/* Gawk configuration options. */
-/*******************************/
-
-/*
- * DEFPATH
- *
- * The default search path for the -f option of gawk. It is used
- * if the AWKPATH environment variable is undefined. The default
- * definition is provided here. Most likely you should not change
- * this.
- */
-
-/* #define DEFPATH ".:/usr/lib/awk:/usr/local/lib/awk" */
-/* #define ENVSEP ':' */
-
-/* anything that follows is for system-specific short-term kludges */
diff --git a/config.h-dist b/config.h.in
index d2b63c06..d2b63c06 100644
--- a/config.h-dist
+++ b/config.h.in
diff --git a/configure b/configure
index 2ff6dbff..3a39811c 100755
--- a/configure
+++ b/configure
@@ -11,7 +11,7 @@ case "$#" in
esac
if [ -f config/$1 ]; then
- sh ./mungeconf config/$1 config.h-dist >config.h
+ sh ./mungeconf config/$1 config.h.in >config.h
# echo #echo lines to stdout
sed -n '/^#echo /s///p' config/$1
@@ -19,14 +19,14 @@ if [ -f config/$1 ]; then
sed -n '/^MAKE_.*/s//s,^##&## ,,/p' config/$1 >sedscr
if [ -s sedscr ]
then
- sed -f sedscr Makefile-dist >Makefile
+ sed -f sedscr Makefile.in >Makefile
else
- cp Makefile-dist Makefile
+ cp Makefile.in Makefile
fi
rm -f sedscr
else
echo "\`$1' is not a known configuration."
echo "Either construct one based on the examples in the config directory,"
- echo "or copy config.h-dist to config.h and edit it."
+ echo "or copy config.h.in to config.h and edit it."
exit 1
fi
diff --git a/eval.c b/eval.c
index a467315d..58c39fc0 100644
--- a/eval.c
+++ b/eval.c
@@ -10,8 +10,8 @@
*
* GAWK is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 1, or (at your option)
- * any later version.
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
*
* GAWK is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -20,7 +20,7 @@
*
* You should have received a copy of the GNU General Public License
* along with GAWK; see the file COPYING. If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include "awk.h"
@@ -417,6 +417,9 @@ register NODE *tree;
lhs = get_lhs(tree, (Func_ptr *)0);
return *lhs;
+ case Node_var_array:
+ fatal("attempt to use an array in a scalar context");
+
case Node_unary_minus:
t1 = tree_eval(tree->subnode);
x = -force_number(t1);
@@ -591,6 +594,9 @@ register NODE *tree;
case Node_minus:
return tmp_number(x1 - x2);
+ case Node_var_array:
+ fatal("attempt to use an array in a scalar context");
+
default:
fatal("illegal type (%d) in tree_eval", tree->type);
}
@@ -709,11 +715,15 @@ register NODE *tree;
lhs = get_lhs(tree->lnode, &after_assign);
lval = force_number(*lhs);
- unref(*lhs);
+ /*
+ * Can't unref *lhs until we know the type; doing so
+ * too early breaks x += x sorts of things.
+ */
switch(tree->type) {
case Node_preincrement:
case Node_predecrement:
+ unref(*lhs);
*lhs = make_number(lval +
(tree->type == Node_preincrement ? 1.0 : -1.0));
if (after_assign)
@@ -722,6 +732,7 @@ register NODE *tree;
case Node_postincrement:
case Node_postdecrement:
+ unref(*lhs);
*lhs = make_number(lval +
(tree->type == Node_postincrement ? 1.0 : -1.0));
if (after_assign)
@@ -734,6 +745,7 @@ register NODE *tree;
tmp = tree_eval(tree->rnode);
rval = force_number(tmp);
free_temp(tmp);
+ unref(*lhs);
switch(tree->type) {
case Node_assign_exp:
if ((ltemp = rval) == rval) { /* integer exponent */
@@ -912,6 +924,7 @@ NODE *arg_list; /* Node_expression_list of calling args. */
arg = stack_ptr[arg->param_cnt];
n = *sp++;
if (arg->type == Node_var && n->type == Node_var_array) {
+ /* should we free arg->var_value ? */
arg->var_array = n->var_array;
arg->type = Node_var_array;
}
@@ -921,6 +934,11 @@ NODE *arg_list; /* Node_expression_list of calling args. */
}
while (count-- > 0) {
n = *sp++;
+ /* if n is a (local) array, all the elements should be freed */
+ if (n->type == Node_var_array) {
+ assoc_clear(n);
+ free(n->var_array);
+ }
unref(n->lnode);
freenode(n);
}
diff --git a/field.c b/field.c
index 10210a50..26e26dac 100644
--- a/field.c
+++ b/field.c
@@ -10,8 +10,8 @@
*
* GAWK is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 1, or (at your option)
- * any later version.
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
*
* GAWK is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -20,7 +20,7 @@
*
* You should have received a copy of the GNU General Public License
* along with GAWK; see the file COPYING. If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include "awk.h"
@@ -43,7 +43,6 @@ static Regexp *FS_regexp = NULL;
static char *parse_extent; /* marks where to restart parse of record */
static int parse_high_water=0; /* field number that we have parsed so far */
static int nf_high_water = 0; /* size of fields_arr */
-static char f_empty[] = "\0";
static int resave_fs;
static NODE *save_FS;
static char *save_fs; /* save current value of FS when line is read,
@@ -53,6 +52,7 @@ static char *save_fs; /* save current value of FS when line is read,
NODE **fields_arr; /* array of pointers to the field nodes */
int field0_valid = 1; /* $(>0) has not been changed yet */
NODE *field0;
+int default_FS;
static NODE **nodes; /* permanent repository of field nodes */
static int *FIELDWIDTHS = NULL;
@@ -65,8 +65,10 @@ init_fields()
field0->type = Node_val;
field0->stref = 0;
field0->stptr = "";
+ field0->stlen = 0;
field0->flags = (STRING|STR|PERM); /* never free buf */
fields_arr[0] = field0;
+ parse_extent = fields_arr[0]->stptr;
save_FS = dupnode(FS_node->var_value);
save_fs = save_FS->stptr;
}
@@ -98,7 +100,6 @@ int len;
NODE *dummy; /* not used -- just to make interface same as set_element */
{
register NODE *n;
- register int t;
if (num > nf_high_water)
grow_fields_arr(num);
@@ -203,6 +204,8 @@ void
set_NF()
{
NF = (int) force_number(NF_node->var_value);
+ if (NF > nf_high_water)
+ grow_fields_arr(NF);
field0_valid = 0;
}
@@ -225,19 +228,15 @@ NODE *n;
register int nf = parse_high_water;
register char *field;
register char *end = scan + len;
- char *cp;
if (up_to == HUGE)
nf = 0;
if (len == 0)
return nf;
- cp = FS_node->var_value->stptr;
- if (*RS == 0 && *cp == ' ' && *(cp+1) == '\0') {
- while (scan < end
- && (*scan == '\n' || *scan == ' ' || *scan == '\t'))
+ if (*RS == 0 && default_FS)
+ while (scan < end && isspace(*scan))
scan++;
- }
field = scan;
while (scan < end
&& research(rp, scan, (int)(end - scan), 1) != -1
@@ -254,8 +253,10 @@ NODE *n;
(*set)(++nf, field, RESTART(rp, scan), n);
scan += REEND(rp, scan);
field = scan;
+ if (scan == end) /* FS at end of record */
+ (*set)(++nf, field, 0, n);
}
- if (nf != up_to && *RS != 0 && scan < end) {
+ if (nf != up_to && scan < end) {
(*set)(++nf, scan, (int)(end - scan), n);
scan = end;
}
@@ -384,17 +385,17 @@ NODE *n;
}
NODE **
-get_field(num, assign)
-register int num;
+get_field(requested, assign)
+register int requested;
Func_ptr *assign; /* this field is on the LHS of an assign */
{
- int n;
+ int parsed;
/*
* if requesting whole line but some other field has been altered,
* then the whole line must be rebuilt
*/
- if (num == 0) {
+ if (requested == 0) {
if (!field0_valid) {
/* first, parse remainder of input record */
if (NF == -1) {
@@ -412,53 +413,52 @@ Func_ptr *assign; /* this field is on the LHS of an assign */
return &fields_arr[0];
}
- /* assert(num > 0); */
+ /* assert(requested > 0); */
if (assign)
field0_valid = 0;
- if (num <= parse_high_water) /* we have already parsed this field */
- return &fields_arr[num];
+ if (requested <= parse_high_water) /* we have already parsed this field */
+ return &fields_arr[requested];
if (parse_high_water == 0) /* starting at the beginning */
parse_extent = fields_arr[0]->stptr;
/*
- * parse up to num fields, calling set_field() for each, and saving
+ * parse up to requested fields, calling set_field() for each, and saving
* in parse_extent the point where the parse left off
*/
- n = (*parse_field)(num, &parse_extent,
+ parsed = (*parse_field)(requested, &parse_extent,
fields_arr[0]->stlen - (parse_extent-fields_arr[0]->stptr),
save_fs, FS_regexp, set_field, (NODE *)NULL);
- parse_high_water = n;
- if (num == HUGE-1)
- num = n;
- if (n < num) { /* requested field number beyond end of record; */
- register int i;
-
- if (num > nf_high_water)
- grow_fields_arr(num);
-
- /* fill in fields that don't exist */
- for (i = n + 1; i <= num; i++)
- fields_arr[i] = Nnull_string;
-
- /*
- * if this field is onthe LHS of an assignment, then we want to
- * set NF to this value, below
- */
- if (assign)
- n = num;
- }
+ parse_high_water = parsed;
/*
* if we reached the end of the record, set NF to the number of fields
- * so far. Note that num might actually refer to a field that
+ * so far. Note that requested might actually refer to a field that
* is beyond the end of the record, but we won't set NF to that value at
* this point, since this is only a reference to the field and NF
- * only gets set if the field is assigned to -- in this case n has
- * been set to num above
+ * only gets set if the field is assigned to -- that case is handled
+ * below, where the record is expanded and NF is set to requested
*/
if (parse_extent == fields_arr[0]->stptr + fields_arr[0]->stlen)
- NF = n;
+ NF = parsed;
+ if (requested == HUGE-1)
+ requested = parsed;
+ if (parsed < requested) { /* requested field beyond end of record; */
+ if (assign) { /* expand record */
+ register int i;
+
+ if (requested > nf_high_water)
+ grow_fields_arr(requested);
+
+ /* fill in fields that don't exist */
+ for (i = parsed + 1; i <= requested; i++)
+ fields_arr[i] = Nnull_string;
+
+ NF = requested;
+ parse_high_water = requested;
+ } else
+ return &Nnull_string;
+ }
- return &fields_arr[num];
+ return &fields_arr[requested];
}
static void
@@ -530,6 +530,7 @@ set_FS()
register NODE *tmp;
static char buf[10];
+ default_FS = 0;
if (FS_regexp) {
refree(FS_regexp);
FS_regexp = NULL;
@@ -541,9 +542,10 @@ set_FS()
parse_field = re_parse_field;
FS = buf;
if (tmp->stlen == 1) {
- if (tmp->stptr[0] == ' ')
+ if (tmp->stptr[0] == ' ') {
(void) strcpy(buf, "[ \n]+");
- else if (tmp->stptr[0] != '\n')
+ default_FS = 1;
+ } else if (tmp->stptr[0] != '\n')
sprintf(buf, "[%c\n]", tmp->stptr[0]);
else {
parse_field = sc_parse_field;
@@ -558,6 +560,8 @@ set_FS()
} else {
if (tmp->stlen > 1)
parse_field = re_parse_field;
+ else if (*FS == ' ' && tmp->stlen == 1)
+ default_FS = 1;
else if (*FS != ' ' && tmp->stlen == 1)
parse_field = sc_parse_field;
}
diff --git a/gawk.texinfo b/gawk.texinfo
index fda27951..00a8ada5 100644
--- a/gawk.texinfo
+++ b/gawk.texinfo
@@ -1,6 +1,6 @@
\input texinfo @c -*-texinfo-*-
@c %**start of header (This is for running Texinfo on a region.)
-@setfilename gawk-info
+@setfilename gawk.info
@settitle The GAWK Manual
@c %**end of header (This is for running Texinfo on a region.)
@@ -21,7 +21,7 @@
This file documents @code{awk}, a program that you can use to select
particular records in a file and perform operations upon them.
-Copyright (C) 1989 Free Software Foundation, Inc.
+Copyright (C) 1989, 1991 Free Software Foundation, Inc.
Permission is granted to make and distribute verbatim copies of
this manual provided the copyright notice and this permission notice
@@ -52,36 +52,31 @@ by the Foundation.
@setchapternewpage odd
@titlepage
-@sp 11
-@center @titlefont{The GAWK Manual}
-@sp 4
-@center by
-@center Diane Barlow Close
-@center Arnold D. Robbins
-@center Paul H. Rubin
-@center Richard Stallman
-@sp 2
-@center Edition 0.12 Beta
-@sp 2
-@center October 1989
+@title The GAWK Manual
+@subtitle Edition 0.13 Beta
+@subtitle October 1991
+@author Diane Barlow Close
+@author Arnold D. Robbins
+@author Paul H. Rubin
+@author Richard Stallman
@c Include the Distribution inside the titlepage environment so
@c that headings are turned off. Headings on and off do not work.
@page
@vskip 0pt plus 1filll
-Copyright @copyright{} 1989 Free Software Foundation, Inc.
+Copyright @copyright{} 1989, 1991 Free Software Foundation, Inc.
@sp 2
-This is Edition 0.12 Beta of @cite{The GAWK Manual}, @*
-for the 2.11.1 version of the GNU implementation @*
+This is Edition 0.13 Beta of @cite{The GAWK Manual}, @*
+for the 2.13.3 version of the GNU implementation @*
of AWK.
@sp 2
Published by the Free Software Foundation @*
675 Massachusetts Avenue, @*
Cambridge, MA 02139 USA @*
-Printed copies are available for $10 each.
+Printed copies are available for $15 each.
Permission is granted to make and distribute verbatim copies of
this manual provided the copyright notice and this permission notice
@@ -98,12 +93,13 @@ except that this permission notice may be stated in a translation approved
by the Foundation.
@end titlepage
+@ifinfo
@node Top, Preface, (dir), (dir)
@comment node-name, next, previous, up
+@top General Introduction
@c Preface or Licensing nodes should come right after the Top
@c node, in `unnumbered' sections, then the chapter, `What is gawk'.
-@ifinfo
This file documents @code{awk}, a program that you can use to select
particular records in a file and perform operations upon them; it
contains the following chapters:
@@ -152,10 +148,14 @@ contains the following chapters:
* Language History:: The evolution of the @code{awk} language.
+* Installation:: Installing @code{gawk} under various operating systems.
+
* Gawk Summary:: @code{gawk} Options and Language Summary.
* Sample Program:: A sample @code{awk} program with a complete explanation.
+* Bugs:: Reporting Problems and Bugs.
+
* Notes:: Something about the implementation of @code{gawk}.
* Glossary:: An explanation of some unfamiliar terms.
@@ -167,7 +167,9 @@ contains the following chapters:
@comment node-name, next, previous, up
@unnumbered Preface
-@c @cindex what is @code{awk}
+@iftex
+@cindex what is @code{awk}
+@end iftex
If you are like many computer users, you frequently would like to make
changes in various text files wherever certain patterns appear, or
extract data from parts of certain lines while discarding the rest. To
@@ -180,11 +182,12 @@ that makes it possible to handle simple data-reformatting jobs easily
with just a few lines of code.
The GNU implementation of @code{awk} is called @code{gawk}; it is fully
-upward compatible with the System V Release 3.1 and later
-version of @code{awk}. All properly written
-@code{awk} programs should work with @code{gawk}. So we usually don't
-distinguish between @code{gawk} and other @code{awk} implementations in
-this manual.@refill
+upward compatible with the System V Release 4 version of
+@code{awk}. @code{gawk} is also upward compatible with the @sc{POSIX}
+(draft) specification of the @code{awk} language. This means that all
+properly written @code{awk} programs should work with @code{gawk}.
+So we usually don't distinguish between @code{gawk} and other @code{awk}
+implementations in this manual.@refill
@cindex uses of @code{awk}
This manual teaches you what @code{awk} does and how you can use
@@ -208,6 +211,13 @@ even experiment with algorithms that can be adapted later to other computer
languages!
@end itemize
+@iftex
+This manual has the difficult task of being both tutorial and reference.
+If you are a novice, feel free to skip over details that seem too complex.
+You should also ignore the many cross references; they are for the
+expert user, and for the on-line Info version of the manual.
+@end iftex
+
@menu
* History:: The history of @code{gawk} and @code{awk}. Acknowledgements.
@end menu
@@ -225,69 +235,79 @@ language more powerful, introducing user-defined functions, multiple input
streams, and computed regular expressions.
This new version became generally available with System V Release 3.1.
The version in System V Release 4 added some new features and also cleaned
-up the behaviour in some of the ``dark corners'' of the language.@refill
-@comment We don't refer people to non-free information
-@comment In 1988, the original authors
-@comment published @cite{The AWK Programming Language} (Addison-Wesley, ISBN
-@comment 0-201-07981-X), as a definitive description of the @code{awk} language.
+up the behavior in some of the ``dark corners'' of the language.
+The specification for @code{awk} in the @sc{POSIX} Command Language
+and Utilities standard further clarified the language based on feedback
+from both the @code{gawk} designers and the original @code{awk}
+designers.@refill
The GNU implementation, @code{gawk}, was written in 1986 by Paul Rubin
and Jay Fenlason, with advice from Richard Stallman. John Woods
contributed parts of the code as well. In 1988 and 1989, David Trueman, with
help from Arnold Robbins, thoroughly reworked @code{gawk} for compatibility
-with the newer @code{awk}.
+with the newer @code{awk}. Current development focuses on bug fixes,
+performance improvements, and standards compliance.
Many people need to be thanked for their assistance in producing this
manual. Jay Fenlason contributed many ideas and sample programs. Richard
-Mlynarik and Robert Chassell gave helpful comments on drafts of this
+Mlynarik and Robert Chassell gave helpful comments on early drafts of this
manual. The paper @cite{A Supplemental Document for @code{awk}} by John W.
Pierce of the Chemistry Department at UC San Diego, pinpointed several
issues relevant both to @code{awk} implementation and to this manual, that
-would otherwise have escaped us.
+would otherwise have escaped us. David Trueman, Pat Rankin, and Michal
+Jaegermann also contributed portions of the manual.@refill
+
+The following people provided many helpful comments on this edition of
+the manual: Rick Adams, Michael Brennan, Rich Burridge, Diane Close,
+Christopher (``Topher'') Eliot, Michael Lijewski, Pat Rankin, Miriam Robbins,
+and Michal Tomczak.
Finally, we would like to thank Brian Kernighan of Bell Labs for invaluable
assistance during the testing and debugging of @code{gawk}, and for
-help in clarifying several points about the language.@refill
+help in clarifying numerous points about the language.@refill
@node Copying, This Manual, Preface, Top
-@unnumbered GNU General Public License
-@center Version 1, February 1989
+@unnumbered GNU GENERAL PUBLIC LICENSE
+@center Version 2, June 1991
@display
-Copyright @copyright{} 1989 Free Software Foundation, Inc.
+Copyright @copyright{} 1989, 1991 Free Software Foundation, Inc.
675 Mass Ave, Cambridge, MA 02139, USA
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
@end display
-@c fakenode - this is for prepinfo.
+@c fakenode --- for prepinfo
@unnumberedsec Preamble
- The license agreements of most software companies try to keep users
-at the mercy of those companies. By contrast, our General Public
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
License is intended to guarantee your freedom to share and change free
-software---to make sure the software is free for all its users. The
-General Public License applies to the Free Software Foundation's
-software and to any other program whose authors commit to using it.
-You can use it for your programs, too.
+software---to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.) You can apply it to
+your programs, too.
When we speak of free software, we are referring to freedom, not
-price. Specifically, the General Public License is designed to make
-sure that you have the freedom to give away or sell copies of free
-software, that you receive source code or can get it if you want it,
-that you can change the software or use pieces of it in new free
-programs; and that you know you can do these things.
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
To protect your rights, we need to make restrictions that forbid
anyone to deny you these rights or to ask you to surrender the rights.
These restrictions translate to certain responsibilities for you if you
distribute copies of the software, or if you modify it.
- For example, if you distribute copies of a such a program, whether
+ For example, if you distribute copies of such a program, whether
gratis or for a fee, you must give the recipients all the rights that
you have. You must make sure that they, too, receive or can get the
-source code. And you must tell them their rights.
+source code. And you must show them these terms so they know their
+rights.
We protect your rights with two steps: (1) copyright the software, and
(2) offer you this license which gives you legal permission to copy,
@@ -300,129 +320,217 @@ want its recipients to know that what they have is not the original, so
that any problems introduced by others will not reflect on the original
authors' reputations.
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
The precise terms and conditions for copying, distribution and
modification follow.
@iftex
-@c fakenode -- this is for prepinfo.
-@unnumberedsec TERMS AND CONDITIONS
+@c fakenode --- for prepinfo
+@unnumberedsec TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
@end iftex
@ifinfo
-@center TERMS AND CONDITIONS
+@center TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
@end ifinfo
@enumerate
@item
-This License Agreement applies to any program or other work which
-contains a notice placed by the copyright holder saying it may be
-distributed under the terms of this General Public License. The
-``Program'', below, refers to any such program or work, and a ``work based
-on the Program'' means either the Program or any work containing the
-Program or a portion of it, either verbatim or with modifications. Each
-licensee is addressed as ``you''.
+This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The ``Program'', below,
+refers to any such program or work, and a ``work based on the Program''
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term ``modification''.) Each licensee is addressed as ``you''.
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
@item
-You may copy and distribute verbatim copies of the Program's source
-code as you receive it, in any medium, provided that you conspicuously and
-appropriately publish on each copy an appropriate copyright notice and
-disclaimer of warranty; keep intact all the notices that refer to this
-General Public License and to the absence of any warranty; and give any
-other recipients of the Program a copy of this General Public License
-along with the Program. You may charge a fee for the physical act of
-transferring a copy.
+You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
-@item
-You may modify your copy or copies of the Program or any portion of
-it, and copy and distribute such modifications under the terms of Paragraph
-1 above, provided that you also do the following:
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
-@itemize @bullet
@item
-cause the modified files to carry prominent notices stating that
-you changed the files and the date of any change; and
+You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+@enumerate a
@item
-cause the whole of any work that you distribute or publish, that
-in whole or in part contains the Program or any part thereof, either
-with or without modifications, to be licensed at no charge to all
-third parties under the terms of this General Public License (except
-that you may choose to grant warranty protection to some or all
-third parties, at your option).
+You must cause the modified files to carry prominent notices
+stating that you changed the files and the date of any change.
@item
-If the modified program normally reads commands interactively when
-run, you must cause it, when started running for such interactive use
-in the simplest and most usual way, to print or display an
-announcement including an appropriate copyright notice and a notice
-that there is no warranty (or else, saying that you provide a
-warranty) and that users may redistribute the program under these
-conditions, and telling the user how to view a copy of this General
-Public License.
+You must cause any work that you distribute or publish, that in
+whole or in part contains or is derived from the Program or any
+part thereof, to be licensed as a whole at no charge to all third
+parties under the terms of this License.
@item
-You may charge a fee for the physical act of transferring a
-copy, and you may at your option offer warranty protection in
-exchange for a fee.
-@end itemize
+If the modified program normally reads commands interactively
+when run, you must cause it, when started running for such
+interactive use in the most ordinary way, to print or display an
+announcement including an appropriate copyright notice and a
+notice that there is no warranty (or else, saying that you provide
+a warranty) and that users may redistribute the program under
+these conditions, and telling the user how to view a copy of this
+License. (Exception: if the Program itself is interactive but
+does not normally print such an announcement, your work based on
+the Program is not required to print an announcement.)
+@end enumerate
-Mere aggregation of another independent work with the Program (or its
-derivative) on a volume of a storage or distribution medium does not bring
-the other work under the scope of these terms.
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
@item
-You may copy and distribute the Program (or a portion or derivative of
-it, under Paragraph 2) in object code or executable form under the terms of
-Paragraphs 1 and 2 above provided that you also do one of the following:
+You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
-@itemize @bullet
+@enumerate a
@item
-accompany it with the complete corresponding machine-readable
-source code, which must be distributed under the terms of
-Paragraphs 1 and 2 above; or,
+Accompany it with the complete corresponding machine-readable
+source code, which must be distributed under the terms of Sections
+1 and 2 above on a medium customarily used for software interchange; or,
@item
-accompany it with a written offer, valid for at least three
-years, to give any third party free (except for a nominal charge
-for the cost of distribution) a complete machine-readable copy of the
-corresponding source code, to be distributed under the terms of
-Paragraphs 1 and 2 above; or,
+Accompany it with a written offer, valid for at least three
+years, to give any third party, for a charge no more than your
+cost of physically performing source distribution, a complete
+machine-readable copy of the corresponding source code, to be
+distributed under the terms of Sections 1 and 2 above on a medium
+customarily used for software interchange; or,
@item
-accompany it with the information you received as to where the
-corresponding source code may be obtained. (This alternative is
+Accompany it with the information you received as to the offer
+to distribute corresponding source code. (This alternative is
allowed only for noncommercial distribution and only if you
-received the program in object code or executable form alone.)
-@end itemize
+received the program in object code or executable form with such
+an offer, in accord with Subsection b above.)
+@end enumerate
-Source code for a work means the preferred form of the work for making
-modifications to it. For an executable file, complete source code means
-all the source code for all modules it contains; but, as a special
-exception, it need not include source code for modules which are standard
-libraries that accompany the operating system on which the executable
-file runs, or for standard header files or definitions files that
-accompany that operating system.
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
@item
-You may not copy, modify, sublicense, distribute or transfer the
-Program except as expressly provided under this General Public License.
-Any attempt otherwise to copy, modify, sublicense, distribute or transfer
-the Program is void, and will automatically terminate your rights to use
-the Program under this License. However, parties who have received
-copies, or rights to use copies, from you under this General Public
-License will not have their licenses terminated so long as such parties
-remain in full compliance.
+You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
@item
-By copying, distributing or modifying the Program (or any work based
-on the Program) you indicate your acceptance of this license to do so,
-and all its terms and conditions.
+You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
@item
Each time you redistribute the Program (or any work based on the
-Program), the recipient automatically receives a license from the original
-licensor to copy, distribute or modify the Program subject to these
-terms and conditions. You may not impose any further restrictions on the
-recipients' exercise of the rights granted herein.
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+@item
+If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+@item
+If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
@item
The Free Software Foundation may publish revised and/or new versions
@@ -431,11 +539,11 @@ be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the Program
-specifies a version number of the license which applies to it and ``any
+specifies a version number of this License which applies to it and ``any
later version'', you have the option of following the terms and conditions
either of that version or of any later version published by the Free
Software Foundation. If the Program does not specify a version number of
-the license, you may choose any version ever published by the Free Software
+this License, you may choose any version ever published by the Free Software
Foundation.
@item
@@ -448,7 +556,7 @@ of preserving the free status of all derivatives of our free software and
of promoting the sharing and reuse of software generally.
@iftex
-@c fakenode -- this is for prepinfo.
+@c fakenode --- for prepinfo
@heading NO WARRANTY
@end iftex
@ifinfo
@@ -467,19 +575,19 @@ PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
REPAIR OR CORRECTION.
@item
-IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL
-ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
-INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES
-ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT
-LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES
-SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE
-WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN
-ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
@end enumerate
@iftex
-@c fakenode -- this is for prepinfo.
+@c fakenode --- for prepinfo
@heading END OF TERMS AND CONDITIONS
@end iftex
@ifinfo
@@ -487,18 +595,17 @@ ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
@end ifinfo
@page
-@c fakenode -- this is for prepinfo.
-@unnumberedsec Appendix: Using These Terms in New Programs
+@c fakenode --- for prepinfo
+@unnumberedsec How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
-possible use to humanity, the best way to achieve this is to make it
-free software which everyone can redistribute and change under these
-terms.
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
- To do so, attach the following notices to the program. It is safest to
-attach them to the start of each source file to most effectively convey
-the exclusion of warranty; and each file should have at least the
-``copyright'' line and a pointer to where the full notice is found.
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the ``copyright'' line and a pointer to where the full notice is found.
@smallexample
@var{one line to give the program's name and a brief idea of what it does.}
@@ -506,8 +613,8 @@ Copyright (C) 19@var{yy} @var{name of author}
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 1, or (at your option)
-any later version.
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -531,26 +638,29 @@ This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.
@end smallexample
-The hypothetical commands `show w' and `show c' should show the
-appropriate parts of the General Public License. Of course, the
-commands you use may be called something other than `show w' and `show
-c'; they could even be mouse-clicks or menu items---whatever suits your
-program.
+The hypothetical commands @samp{show w} and @samp{show c} should show
+the appropriate parts of the General Public License. Of course, the
+commands you use may be called something other than @samp{show w} and
+@samp{show c}; they could even be mouse-clicks or menu items---whatever
+suits your program.
You should also get your employer (if you work as a programmer) or your
school, if any, to sign a ``copyright disclaimer'' for the program, if
-necessary. Here a sample; alter the names:
+necessary. Here is a sample; alter the names:
@example
-Yoyodyne, Inc., hereby disclaims all copyright interest in the
-program `Gnomovision' (a program to direct compilers to make passes
-at assemblers) written by James Hacker.
+Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+`Gnomovision' (which makes passes at compilers) written by James Hacker.
@var{signature of Ty Coon}, 1 April 1989
Ty Coon, President of Vice
@end example
-That's all there is to it!
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Library General
+Public License instead of this License.
@node This Manual, Getting Started, Copying, Top
@chapter Using This Manual
@@ -561,12 +671,17 @@ That's all there is to it!
@cindex @code{awk} language
@cindex @code{awk} program
-The term @code{gawk} refers to a particular program (a version of
-@code{awk}, developed as part the GNU project), and to the language you
-use to tell this program what to do. When we need to be careful, we
-call the program ``the @code{awk} utility'' and the language ``the
-@code{awk} language''. The purpose of this manual is to explain the
-@code{awk} language and how to run the @code{awk} utility.
+The term @code{awk} refers to a particular program, and to the language you
+use to tell this program what to do. When we need to be careful, we call
+the program ``the @code{awk} utility'' and the language ``the @code{awk}
+language.'' The term @code{gawk} refers to a version of @code{awk}, developed
+as part of the GNU project. The purpose of this manual is to explain the
+@code{awk} language and how to run the @code{awk} utility.@refill
+
+While concentrating on the features of @code{gawk}, the manual will also
+attempt to describe important differences between @code{gawk} and other
+@code{awk} implementations. In particular, any features that are not
+in the @sc{POSIX} standard for @code{awk} will be noted. @refill
The term @dfn{@code{awk} program} refers to a program written by you in
the @code{awk} programming language.@refill
@@ -579,8 +694,8 @@ Some useful ``one-liners'' are included to give you a feel for the
@ignore
@strong{I deleted four paragraphs here because they would confuse the
-beginner more than help him. They mention terms such as ``field'',
-``pattern'', ``action'', ``built-in function'' which the beginner
+beginner more than help him. They mention terms such as ``field,''
+``pattern,'' ``action,'' ``built-in function'' which the beginner
doesn't know.}
@strong{If you can find a way to introduce several of these concepts here,
@@ -591,12 +706,16 @@ space than ought to be used here. There may be no way to win.}
@strong{ADR: I'd like to tackle this in phase 2 of my editing.}
@end ignore
-A sizable sample @code{awk} program has been provided for you (@pxref{Sample
-Program}).@refill
+A sample @code{awk} program has been provided for you
+(@pxref{Sample Program}).@refill
If you find terms that you aren't familiar with, try looking them
up in the glossary (@pxref{Glossary}).@refill
+The entire @code{awk} language is summarized for quick reference in
+@ref{Gawk Summary}. Look there if you just need to refresh your memory
+about a particular feature.@refill
+
Most of the time complete @code{awk} programs are used as examples, but in
some of the more advanced sections, only the part of the @code{awk} program
that illustrates the concept being described is shown.@refill
@@ -616,7 +735,7 @@ This chapter contains the following sections:
@cindex @file{BBS-list} file
Many of the examples in this manual take their input from two sample
data files. The first, called @file{BBS-list}, represents a list of
-computer bulletin board systems and information about those systems.
+computer bulletin board systems together with information about those systems.
The second data file, called @file{inventory-shipped}, contains
information about shipments on a monthly basis. Each line of these
files is one @dfn{record}.
@@ -624,7 +743,7 @@ files is one @dfn{record}.
In the file @file{BBS-list}, each record contains the name of a computer
bulletin board, its phone number, the board's baud rate, and a code for
the number of hours it is operational. An @samp{A} in the last column
-means the board operates 24 hours all week. A @samp{B} in the last
+means the board operates 24 hours a day. A @samp{B} in the last
column means the board operates evening and weekend hours, only. A
@samp{C} means the board operates only on weekends.
@@ -651,7 +770,7 @@ also one record. Each record contains the month of the year, the number
of green crates shipped, the number of red boxes shipped, the number of
orange bags shipped, and the number of blue packages shipped,
respectively. There are 16 entries, covering the 12 months of one year
-and 4 months of the next year.
+and 4 months of the next year.@refill
@group
@example
@@ -784,7 +903,7 @@ empty action that does nothing; then no lines would be printed.
@cindex how @code{awk} works
The @code{awk} utility reads the input files one line at a
-time. For each line, @code{awk} tries the patterns of all the rules.
+time. For each line, @code{awk} tries the patterns of each of the rules.
If several patterns match then several actions are run, in the order in
which they appear in the @code{awk} program. If no patterns match, then
no actions are run.
@@ -859,12 +978,12 @@ ls -l | awk '$5 == "Nov" @{ sum += $4 @}
This command prints the total number of bytes in all the files in the
current directory that were last modified in November (of any year).
(In the C shell you would need to type a semicolon and then a backslash
-at the end of the first line; in the Bourne shell or the Bourne-Again
-shell, you can type the example as shown.)
+at the end of the first line; in a @sc{POSIX}-compliant shell, such as the
+Bourne shell or the Bourne-Again shell, you can type the example as shown.)
The @w{@samp{ls -l}} part of this example is a command that gives you a full
listing of all the files in a directory, including file size and date.
-Its output looks like this:
+Its output looks like this:@refill
@example
-rw-r--r-- 1 close 1933 Nov 7 13:05 Makefile
@@ -892,7 +1011,8 @@ matches the string @samp{Nov}. Each time a line has the string
performed. This adds the fourth field (the file size) to the variable
@code{sum}. As a result, when @code{awk} has finished reading all the
input lines, @code{sum} is the sum of the sizes of files whose
-lines matched the pattern.@refill
+lines matched the pattern. (This works because @code{awk} variables
+are automatically initialized to zero.)@refill
After the last line of output from @code{ls} has been processed, the
@code{END} rule is executed, and the value of @code{sum} is
@@ -905,10 +1025,23 @@ displays your output. By manipulating fields and using @code{print}
statements, you can produce some very useful and spectacular looking
reports.@refill
-
@node Running gawk, Comments, More Complex, Getting Started
@section How to Run @code{awk} Programs
+@ignore
+Date: Mon, 26 Aug 91 09:48:10 +0200
+From: gatech!vsoc07.cern.ch!matheys (Jean-Pol Matheys (CERN - ECP Division))
+To: uunet.UU.NET!skeeve!arnold
+Subject: RE: status check
+
+The introduction of Chapter 2 (i.e. before 2.1) should include
+the whole of section 2.4 - it's better to tell people how to run awk programs
+before giving any examples
+
+ADR --- he's right. but for now, don't do this because the rest of the
+chapter would need some rewriting.
+@end ignore
+
@cindex command line formats
@cindex running @code{awk} programs
There are several ways to run an @code{awk} program. If the program is
@@ -923,7 +1056,7 @@ awk '@var{program}' @var{input-file1} @var{input-file2} @dots{}
where @var{program} consists of a series of patterns and actions, as
described earlier.
-When the program is long, you would probably prefer to put it in a file
+When the program is long, it is usually more convenient to put it in a file
and run it with a command like this:
@example
@@ -953,13 +1086,12 @@ where @var{program} consists of a series of @var{patterns} and
@var{actions}, as described earlier.
@cindex single quotes, why needed
-This command format tells the shell to start @code{awk} and use the
+This command format instructs the shell to start @code{awk} and use the
@var{program} to process records in the input file(s). There are single
-quotes around the @var{program} so that the shell doesn't interpret any
-@code{awk} characters as special shell characters. They cause the
+quotes around @var{program} so that the shell doesn't interpret any
+@code{awk} characters as special shell characters. They also cause the
shell to treat all of @var{program} as a single argument for
-@code{awk}. They also allow @var{program} to be more than one line
-long.@refill
+@code{awk} and allow @var{program} to be more than one line long.@refill
This format is also useful for running short or medium-sized @code{awk}
programs from shell scripts, because it avoids the need for a separate
@@ -971,7 +1103,7 @@ reliable since there are no other files to misplace.
@cindex standard input
@cindex input, standard
-You can also use @code{awk} without any input files. If you type the
+You can also run @code{awk} without any input files. If you type the
command line:@refill
@example
@@ -1037,7 +1169,7 @@ more convenient to put the program into a separate file. To tell
awk -f @var{source-file} @var{input-file1} @var{input-file2} @dots{}
@end example
-The @samp{-f} tells the @code{awk} utility to get the @code{awk} program
+The @samp{-f} instructs the @code{awk} utility to get the @code{awk} program
from the file @var{source-file}. Any file name can be used for
@var{source-file}. For example, you could put the program:@refill
@@ -1063,7 +1195,9 @@ awk '/th/'
which was explained earlier (@pxref{Read Terminal}). Note that you
don't usually need single quotes around the file name that you specify
with @samp{-f}, because most file names don't contain any of the shell's
-special characters.
+special characters. Notice that in @file{th-prog}, the @code{awk}
+program did not have single quotes around it. The quotes are only needed
+for programs that are provided on the @code{awk} command line.
If you want to identify your @code{awk} program files clearly as such,
you can add the extension @file{.awk} to the file name. This doesn't
@@ -1081,7 +1215,9 @@ affect the execution of the @code{awk} program, but it does make
Once you have learned @code{awk}, you may want to write self-contained
@code{awk} scripts, using the @samp{#!} script mechanism. You can do
-this on BSD Unix systems and (someday) on GNU.
+this on many Unix systems@footnote{The @samp{#!} mechanism works on
+Unix systems derived from BSD Unix, System V Release 4, and some System
+V Release 3 systems.} and (someday) on GNU.@refill
For example, you could create a text file named @file{hello}, containing
the following (where @samp{BEGIN} is a feature we have not yet
@@ -1103,8 +1239,14 @@ hello
@end example
@noindent
-at the shell, and the system will arrange to run @code{awk} as if you
-had typed:
+at the shell, and the system will arrange to run @code{awk}@footnote{The
+line beginning with @samp{#!} lists the full pathname of an interpreter
+to be run, and an optional initial command line argument to pass to that
+interpreter. The operating system then runs the interpreter with the given
+argument and the full argument list of the executed program. The first argument
+in the list is the full pathname of the @code{awk} program. The rest of the
+argument list will either be options to @code{awk}, or data files,
+or both.} as if you had typed:@refill
@example
awk -f hello
@@ -1128,7 +1270,7 @@ awk '@var{program}' "$@@"
Using this technique, it is @emph{vital} to enclose the @var{program} in
single quotes to protect it from interpretation by the shell. If you
-omit the quotes, only a shell wizard can predict the result.
+omit the quotes, only a shell wizard can predict the results.
The @samp{"$@@"} causes the shell to forward all the command line
arguments to the @code{awk} program, without interpretation. The first
@@ -1136,17 +1278,9 @@ line, which starts with a colon, is used so that this shell script will
work even if invoked by a user who uses the C shell.
@c Someday: (See @cite{The Bourne Again Shell}, by ??.)
-@c We don't refer to hoarded information.
-@c (See
-@c @cite{The UNIX Programming Environment} by Brian Kernighan and Rob Pike,
-@c Prentice-Hall, 1984, for more information on writing shell programs that
-@c use the Unix utilities. The most powerful version of the shell is the
-@c Korn shell. A detailed description of the Korn shell can be found in
-@c @cite{The KornShell Command and Programming Language} by Morris Bolsky
-@c and David Korn, Prentice-Hall, 1989.)
-
@node Comments, Statements/Lines, Running gawk, Getting Started
@section Comments in @code{awk} Programs
+@cindex @samp{#}
@cindex comments
@cindex use of comments
@cindex documenting @code{awk} programs
@@ -1156,7 +1290,7 @@ A @dfn{comment} is some text that is included in a program for the sake
of human readers, and that is not really part of the program. Comments
can explain what the program does, and how it works. Nearly all
programming languages have provisions for comments, because programs are
-hard to understand without their extra help.
+typically hard to understand without their extra help.
In the @code{awk} language, a comment starts with the sharp sign
character, @samp{#}, and continues to the end of the line. The
@@ -1172,7 +1306,7 @@ For example, we could have put the following into @file{th-prog}:@refill
You can put comment lines into keyboard-composed throw-away @code{awk}
programs also, but this usually isn't very useful; the purpose of a
comment is to help you or another person understand the program at
-another time.
+a later time.@refill
@node Statements/Lines, When, Comments, Getting Started
@section @code{awk} Statements versus Lines
@@ -1187,7 +1321,7 @@ awk '/12/ @{ print $0 @}
But sometimes statements can be more than one line, and lines can
contain several statements. You can split a statement into multiple
-lines by inserting a newline after any of the following:
+lines by inserting a newline after any of the following:@refill
@example
, @{ ? : || && do else
@@ -1195,6 +1329,9 @@ lines by inserting a newline after any of the following:
@noindent
A newline at any other point is considered the end of the statement.
+(Splitting lines after @samp{?} and @samp{:} is a minor @code{gawk}
+extension. The @samp{?} and @samp{:} referred to here are those of the
+three-operand conditional expression described in @ref{Conditional Exp}.)
@cindex backslash continuation
@cindex continuation of lines
@@ -1211,25 +1348,30 @@ awk '/This program is too long, so continue it\
@noindent
We have generally not used backslash continuation in the sample programs in
-this manual. Since there is no limit on the length of a line, it is never
-strictly necessary; it just makes programs prettier. We have preferred to
-make them even more pretty by keeping the statements short. Backslash
-continuation is most useful when your @code{awk} program is in a separate
-source file, instead of typed in on the command line.
+this manual. Since in @code{gawk} there is no limit on the length of a line,
+it is never strictly necessary; it just makes programs prettier. We have
+preferred to make them even more pretty by keeping the statements short.
+Backslash continuation is most useful when your @code{awk} program is in a
+separate source file, instead of typed in on the command line. You should
+also note that many @code{awk} implementations are more picky about where
+you may use backslash continuation. For maximal portability of your @code{awk}
+programs, it is best not to split your lines in the middle of a regular
+expression or a string.@refill
@strong{Warning: backslash continuation does not work as described above
with the C shell.} Continuation with backslash works for @code{awk}
programs in files, and also for one-shot programs @emph{provided} you
-are using the Bourne shell or the Bourne-again shell. But the C shell
-used on Berkeley Unix behaves differently! There, you must use two
-backslashes in a row, followed by a newline.@refill
+are using a @sc{POSIX}-compliant shell, such as the Bourne shell or the
+Bourne-again shell. But the C shell used on Berkeley Unix behaves
+differently! There, you must use two backslashes in a row, followed by
+a newline.@refill
@cindex multiple statements on one line
When @code{awk} statements within one rule are short, you might want to put
more than one of them on a line. You do this by separating the statements
-with semicolons, @samp{;}.
+with a semicolon, @samp{;}.
This also applies to the rules themselves.
-Thus, the above example program could have been written:@refill
+Thus, the previous program could have been written:@refill
@example
/12/ @{ print $0 @} ; /21/ @{ print $0 @}
@@ -1251,7 +1393,7 @@ programs, more advanced patterns, field separators, arithmetic
statements, and other selection criteria, you can produce much more
complex output. The @code{awk} language is very useful for producing
reports from large amounts of raw data, such as summarizing information
-from the output of other utility programs such as @code{ls}.
+from the output of other utility programs like @code{ls}.
(@xref{More Complex, , A More Complex Example}.)
Programs written with @code{awk} are usually much smaller than they would
@@ -1286,18 +1428,17 @@ easier to maintain and usually run more efficiently.@refill
@cindex standard input
@vindex FILENAME
In the typical @code{awk} program, all input is read either from the
-standard input (usually the keyboard) or from files whose names you
-specify on the @code{awk} command line. If you specify input files,
-@code{awk} reads data from the first one until it reaches the end; then
-it reads the second file until it reaches the end, and so on. The name
-of the current input file can be found in the built-in variable
-@code{FILENAME} (@pxref{Built-in Variables}).@refill
+standard input (by default the keyboard, but often a pipe from another
+command) or from files whose names you specify on the @code{awk} command
+line. If you specify input files, @code{awk} reads them in order, reading
+all the data from one before going on to the next. The name of the current
+input file can be found in the built-in variable @code{FILENAME}
+(@pxref{Built-in Variables}).@refill
The input is read in units called @dfn{records}, and processed by the
rules one record at a time. By default, each record is one line. Each
-record read is split automatically into @dfn{fields}, to make it more
-convenient for a rule to work on parts of the record under
-consideration.
+record is split automatically into @dfn{fields}, to make it more
+convenient for a rule to work on its parts.
On rare occasions you will need to use the @code{getline} command,
which can do explicit input from any number of files (@pxref{Getline}).
@@ -1308,6 +1449,7 @@ which can do explicit input from any number of files (@pxref{Getline}).
* Non-Constant Fields:: Non-constant Field Numbers.
* Changing Fields:: Changing the Contents of a Field.
* Field Separators:: The field separator and how to change it.
+* Constant Size:: Reading constant width data.
* Multiple Line:: Reading multi-line records.
* Getline:: Reading files under explicit program control
@@ -1323,18 +1465,18 @@ which can do explicit input from any number of files (@pxref{Getline}).
@cindex record separator
The @code{awk} language divides its input into records and fields.
Records are separated by a character called the @dfn{record separator}.
-By default, the record separator is the newline character. Therefore,
-normally, a record is a line of text.@refill
+By default, the record separator is the newline character, defining
+a record to be a single line of text.@refill
-@c @cindex changing the record separator
+@iftex
+@cindex changing the record separator
+@end iftex
@vindex RS
Sometimes you may want to use a different character to separate your
-records. You can use different characters by changing the built-in
-variable @code{RS}.
-
-The value of @code{RS} is a string that says how to separate records;
-the default value is @code{"\n"}, the string of just a newline
-character. This is why records are, by default, single lines.
+records. You can use a different character by changing the built-in
+variable @code{RS}. The value of @code{RS} is a string that says how
+to separate records; the default value is @code{"\n"}, the string containing
+just a newline character. This is why records are, by default, single lines.
@code{RS} can have any string as its value, but only the first character
of the string is used as the record separator. The other characters are
@@ -1381,13 +1523,29 @@ Another way to change the record separator is on the command line,
using the variable-assignment feature (@pxref{Command Line}).
@example
-awk '@dots{}' RS="/" @var{source-file}
+awk '@{ print $0 @}' RS="/" BBS-list
@end example
@noindent
-This sets @code{RS} to @samp{/} before processing @var{source-file}.
+This sets @code{RS} to @samp{/} before processing @file{BBS-list}.
+
+Reaching the end of an input file terminates the current input record,
+even if the last character in the file is not the character in @code{RS}.
+
+@ignore
+@c merge the preceding paragraph and this stuff into one paragraph
+@c and put it in an `expert info' section.
+This produces correct behavior in the vast majority of cases, although
+the following (extreme) pipeline prints a surprising @samp{1}. (There
+is one field, consisting of a newline.)
+
+@example
+echo | awk 'BEGIN @{ RS = "a" @} ; @{ print NF @}'
+@end example
-The empty string (a string of no characters) has a special meaning
+@end ignore
+
+The empty string, @code{""} (a string of no characters), has a special meaning
as the value of @code{RS}: it means that records are separated only
by blank lines. @xref{Multiple Line}, for more details.
@@ -1403,7 +1561,7 @@ but is never automatically reset to zero.
If you change the value of @code{RS} in the middle of an @code{awk} run,
the new value is used to delimit subsequent records, but the record
-currently being processed (and records already finished) are not
+currently being processed (and records already processed) are not
affected.
@node Fields, Non-Constant Fields, Records, Reading Files
@@ -1413,13 +1571,13 @@ affected.
@cindex fields
@cindex accessing fields
When @code{awk} reads an input record, the record is
-automatically separated or @dfn{parsed} by the interpreter into pieces
+automatically separated or @dfn{parsed} by the interpreter into chunks
called @dfn{fields}. By default, fields are separated by whitespace,
like words in a line.
Whitespace in @code{awk} means any string of one or more spaces and/or
tabs; other characters such as newline, formfeed, and so on, that are
considered whitespace by other languages are @emph{not} considered
-whitespace by @code{awk}.
+whitespace by @code{awk}.@refill
The purpose of fields is to make it more convenient for you to refer to
these pieces of the record. You don't have to use them---you can
@@ -1449,7 +1607,7 @@ represented by @code{$NF}. So, in the example above, @code{$NF} would
be the same as @code{$7}, which is @samp{example.}. Why this works is
explained below (@pxref{Non-Constant Fields}). If you try to refer to a
field beyond the last one, such as @code{$8} when the record has only 7
-fields, you get the empty string.
+fields, you get the empty string.@refill
@vindex NF
@cindex number of fields, @code{NF}
@@ -1458,7 +1616,7 @@ is the number of fields in the current record.
@code{$0}, which looks like an attempt to refer to the zeroth field, is
a special case: it represents the whole input record. This is what you
-would use when you aren't interested in fields.
+would use if you weren't interested in fields.
Here are some more examples:
@@ -1470,7 +1628,7 @@ awk '$1 ~ /foo/ @{ print $0 @}' BBS-list
This example prints each record in the file @file{BBS-list} whose first
field contains the string @samp{foo}. The operator @samp{~} is called a
@dfn{matching operator} (@pxref{Comparison Ops}); it tests whether a
-string (here, the field @code{$1}) contains a match for a given regular
+string (here, the field @code{$1}) matches a given regular
expression.@refill
By contrast, the following example:
@@ -1539,7 +1697,7 @@ You can change the contents of a field as seen by @code{awk} within an
current input record. (The actual input is untouched: @code{awk} never
modifies the input file.)
-Look at this example:
+Consider this example:
@example
awk '@{ $3 = $2 - 10; print $2, $3 @}' inventory-shipped
@@ -1613,13 +1771,38 @@ should print @samp{everything is normal}, because @code{NF+1} is certain
to be out of range. (@xref{If Statement}, for more information about
@code{awk}'s @code{if-else} statements.)
-@node Field Separators, Multiple Line, Changing Fields, Reading Files
+It is important to note that assigning to a field will change the
+value of @code{$0}, but will not change the value of @code{NF},
+even when you assign the null string to a field. For example:
+
+@example
+echo a b c d | awk '@{ OFS = ":"; $2 = "" ; print ; print NF @}'
+@end example
+
+@noindent
+prints
+
+@example
+a::c:d
+4
+@end example
+
+@noindent
+The field is still there; it just has an empty value. You can tell
+because there are two colons in a row.
+
+@node Field Separators, Constant Size, Changing Fields, Reading Files
@section Specifying How Fields Are Separated
@vindex FS
@cindex fields, separating
@cindex field separator, @code{FS}
@cindex @samp{-F} option
+(This section is rather long; it describes one of the most fundamental
+operations in @code{awk}. If you are a novice with @code{awk}, we
+recommend that you re-read this section after you have studied the
+section on regular expressions, @ref{Regexp}.)
+
The way @code{awk} splits an input record into fields is controlled by
the @dfn{field separator}, which is a single character or a regular
expression. @code{awk} scans the input record for matches for the
@@ -1635,8 +1818,8 @@ would be split into three fields: @samp{m}, @samp{@ g} and @samp{@ gai@
pan}.
The field separator is represented by the built-in variable @code{FS}.
-Shell programmers take note! @code{awk} does not use the name
-@code{IFS} which is used by the shell.@refill
+Shell programmers take note! @code{awk} does not use the name @code{IFS}
+which is used by the shell.@refill
You can change the value of @code{FS} in the @code{awk} program with the
assignment operator, @samp{=} (@pxref{Assignment Ops}). Often the right
@@ -1658,7 +1841,7 @@ John Q. Smith, 29 Oak St., Walamazoo, MI 42139
@end example
@noindent
-this @code{awk} program extracts the string @samp{29 Oak St.}.
+this @code{awk} program extracts the string @samp{@ 29 Oak St.}.
@cindex field separator, choice of
@cindex regular expressions as field separators
@@ -1673,8 +1856,8 @@ John Q. Smith, LXIX, 29 Oak St., Walamazoo, MI 42139
@end example
@noindent
-the previous sample program would extract @samp{LXIX}, instead of
-@samp{29 Oak St.}. If you were expecting the program to print the
+the previous sample program would extract @samp{@ LXIX}, instead of
+@samp{@ 29 Oak St.}. If you were expecting the program to print the
address, you would be surprised. So choose your data layout and
separator characters carefully to prevent such problems.
@@ -1728,14 +1911,33 @@ a capital @samp{F}. Contrast this with @samp{-f}, which specifies a file
containing an @code{awk} program. Case is significant in command options:
the @samp{-F} and @samp{-f} options have nothing to do with each other.
You can use both options at the same time to set the @code{FS} argument
-@emph{and} get an @code{awk} program from a file.
+@emph{and} get an @code{awk} program from a file.@refill
+
+@c begin expert info
+The value used for the argument to @samp{-F} is processed in exactly the
+same way as assignments to the built-in variable @code{FS}. This means that
+if the field separator contains special characters, they must be escaped
+appropriately. For example, to use a @samp{\} as the field separator, you
+would have to type:
+
+@example
+# same as FS = "\\"
+awk -F\\\\ '@dots{}' files @dots{}
+@end example
+
+@noindent
+Since @samp{\} is used for quoting in the shell, @code{awk} will see
+@samp{-F\\}. Then @code{awk} processes the @samp{\\} for escape
+characters (@pxref{Constants}), finally yielding a single @samp{\} to
+be used for the field separator.
+@c end expert info
As a special case, in compatibility mode (@pxref{Command Line}), if the
argument to @samp{-F} is @samp{t}, then @code{FS} is set to the tab
character. (This is because if you type @samp{-F\t}, without the quotes,
at the shell, the @samp{\} gets deleted, so @code{awk} figures that you
really want your fields to be separated with tabs, and not @samp{t}s.
-Use @samp{FS="t"} on the command line if you really do want to separate
+Use @samp{-v FS="t"} on the command line if you really do want to separate
your fields with @samp{t}s.)
For example, let's use an @code{awk} program file called @file{baud.awk}
@@ -1798,7 +2000,192 @@ separator. Note that fields in @file{/etc/passwd} are separated by
colons. The second field represents a user's encrypted password, but if
the field is empty, that user has no password.
-@node Multiple Line, Getline, Field Separators, Reading Files
+@c begin expert info
+According to the @sc{posix} standard, @code{awk} is supposed to behave
+as if each record is split into fields at the time that it is read.
+In particular, this means that you can change the value of @code{FS}
+after a record is read, but before any of the fields are referenced.
+The value of the fields (i.e., how they were split) should reflect the
+old value of @code{FS}, not the new one.
+
+However, many implementations of @code{awk} do not do this. Instead,
+they defer splitting the fields until a field reference actually happens,
+using the @emph{current} value of @code{FS}! This behavior can be difficult
+to diagnose. The following example illustrates the results of the two methods.
+(The @code{sed} command prints just the first line of @file{/etc/passwd}.)
+
+@example
+sed 1q /etc/passwd | awk '@{ FS = ":" ; print $1 @}'
+@end example
+
+@noindent
+will usually print
+
+@example
+root
+@end example
+
+@noindent
+on an incorrect implementation of @code{awk}, while @code{gawk}
+will print something like
+
+@example
+root:nSijPlPhZZwgE:0:0:Root:/:
+@end example
+@c end expert info
+
+@c begin expert info
+There is an important difference between the two cases of @samp{FS = @w{" "}}
+and @samp{FS = @w{"[ \t]+"}} (which is a regular expression matching one or
+more blanks or tabs). For both values of @code{FS}, fields are separated
+by runs of blanks and/or tabs. However, when the value of @code{FS} is
+@code{" "}, @code{awk} will strip leading and trailing whitespace from the
+record, and then decide where the fields are.
+
+@example
+echo ' a b c d ' | awk '@{ print $2 @}'
+@end example
+
+@noindent
+prints @samp{b}. However, this example
+
+@example
+echo ' a b c d ' | awk 'BEGIN @{ FS = "[ \t]+" @} ; @{ print $2 @}'
+@end example
+
+@noindent
+prints @samp{a}. In this case, the first field is null.
+
+The stripping of leading and trailing whitespace also comes into
+play whenever @code{$0} is recomputed. For instance, this pipeline
+
+@example
+echo ' a b c d' | awk '@{ print; $2 = $2; print @}'
+@end example
+
+@noindent
+produces this output:
+
+@example
+ a b c d
+a b c d
+@end example
+
+@noindent
+The first @code{print} statement prints the record as it was read,
+with leading whitespace intact. The assignment to @code{$2} rebuilds
+@code{$0} by concatenating @code{$1} through @code{$NF} together,
+separated by the value of @code{OFS}. Since the leading whitespace
+was ignored when finding @code{$1}, it is not part of the new @code{$0}.
+Finally, the last @code{print} statement prints the new @code{$0}.
+@c end expert info
+
+The following table summarizes how fields are split, based on the
+value of @code{FS}.
+
+@table @code
+@item FS == " "
+Fields are separated by runs of whitespace. Leading and trailing
+whitespace are ignored. This is the default.
+
+@item FS == @var{any single character}
+Fields are separated by each occurrence of the character. Multiple
+successive occurrences delimit empty fields, as do leading and
+trailing occurrences.
+
+@item FS == @var{regexp}
+Fields are separated by occurrences of characters that match @var{regexp}.
+Leading and trailing matches of @var{regexp} delimit empty fields.
+@end table
+
+@node Constant Size, Multiple Line, Field Separators, Reading Files
+@section Reading Constant Width Data
+
+(This section discusses an advanced, experimental feature. If you are
+a novice @code{awk} user, you may wish to skip it on the first reading.)
+
+@code{gawk} 2.13 introduced a new facility for dealing with fixed-width fields
+with no distinctive field separator. Data of this nature arises typically
+in one of at least two ways: the input for old FORTRAN programs where
+numbers are run together, and the output of programs that did not anticipate
+the use of their output as input for other programs.
+
+An example of the latter is a table where all the columns are lined up by
+the use of a variable number of spaces and @emph{empty fields are just
+spaces}. Clearly, @code{awk}'s normal field splitting based on @code{FS}
+will not work well in this case. (Although a portable @code{awk} program
+can use a series of @code{substr} calls on @code{$0}, this is awkward and
+inefficient for a large number of fields.)@refill
+
+The splitting of an input record into fixed-width fields is specified by
+assigning a string containing space-separated numbers to the built-in
+variable @code{FIELDWIDTHS}. Each number specifies the width of the field
+@emph{including} columns between fields. If you want to ignore the columns
+between fields, you can specify the width as a separate field that is
+subsequently ignored.
+
+The following data is the output of the @code{w} utility. It is useful
+to illustrate the use of @code{FIELDWIDTHS}.
+
+@example
+ 10:06pm up 21 days, 14:04, 23 users, load average: 1.21, 1.36, 1.43
+User tty login@ idle JCPU PCPU what
+hzuo ttyV0 8:58pm 9 5 vi p24.tex
+hzang ttyV3 6:37pm 50 -csh
+eklye ttyV5 9:53pm 7 1 em thesis_11jul91.tex
+dportein ttyV6 8:17pm 1:47 -csh
+gierd ttyD3 10:00pm 1 elm
+dave ttyD4 9:47pm 4 4 w
+brent ttyp0 26Jun91 4:46 26:46 4:41 bash
+dave ttyq4 26Jun9115days 46 46 wnewmail
+@end example
+
+The following program takes the above input, converts the idle time to
+a number of seconds and prints out the first two fields and the calculated
+idle time. (This program uses a number of @code{awk} features that
+haven't been introduced yet.)@refill
+
+@example
+BEGIN @{ FIELDWIDTHS = "9 6 10 6 7 7 35" @}
+NR > 2 @{
+ idle = $4
+ sub(/^ */, "", idle) # strip leading spaces
+ if (idle == "") idle = 0
+ if (idle ~ /:/) @{ split(idle, t, ":"); idle = t[1] * 60 + t[2] @}
+ if (idle ~ /days/) @{ idle *= 24 * 60 * 60 @}
+
+ print $1, $2, idle
+ @}
+@end example
+
+Here is the result of running the program on the data:
+
+@example
+hzuo ttyV0 0
+hzang ttyV3 50
+eklye ttyV5 0
+dportein ttyV6 107
+gierd ttyD3 1
+dave ttyD4 0
+brent ttyp0 286
+dave ttyq4 1296000
+@end example
+
+Another (possibly more practical) example of fixed-width input data
+would be the input from a deck of balloting cards. In some parts of
+the United States, voters make their choices by punching holes in computer
+cards. These cards are then processed to count the votes for any particular
+candidate or on any particular issue. Since a voter may choose not to
+vote on some issue, any column on the card may be empty. An @code{awk}
+program for processing such data could use the @code{FIELDWIDTHS} feature
+to simplify reading the data.@refill
+
+@c of course, getting gawk to run on a system with card readers is
+@c another story!
+
+This feature is still experimental, and will likely evolve over time.
+
+@node Multiple Line, Getline, Constant Size, Reading Files
@section Multiple-Line Records
@cindex multiple line records
@@ -1819,7 +2206,7 @@ records. For example, you could use the formfeed character (written
a page of the file. To do this, just set the variable @code{RS} to
@code{"\f"} (a string containing the formfeed character). Any
other character could equally well be used, as long as it won't be part
-of the data in a record.
+of the data in a record.@refill
@ignore
Another technique is to have blank lines separate records. The string
@@ -1839,22 +2226,23 @@ records are separated by one or more blank lines. If you set @code{RS}
to the null string, a record always ends at the first blank line
encountered. And the next record doesn't start until the first nonblank
line that follows---no matter how many blank lines appear in a row, they
-are considered one record-separator.
+are considered one record-separator. (End of file is also considered
+a record separator.)@refill
The second step is to separate the fields in the record. One way to do
this is to put each field on a separate line: to do this, just set the
variable @code{FS} to the string @code{"\n"}. (This simple regular
expression matches a single newline.)
-Another idea is to divide each of the lines into fields in the normal
-manner. This happens by default as a result of a special feature: when
-@code{RS} is set to the null string, the newline character @emph{always}
-acts as a field separator. This is in addition to whatever field
-separations result from @code{FS}.
+Another way to separate fields is to divide each of the lines into fields
+in the normal manner. This happens by default as a result of a special
+feature: when @code{RS} is set to the null string, the newline character
+@emph{always} acts as a field separator. This is in addition to whatever
+field separations result from @code{FS}.
The original motivation for this special exception was probably so that
-you get useful behavior in the default case (i.e., @w{@code{FS == "
-"}}). This feature can be a problem if you really don't want the
+you get useful behavior in the default case (i.e., @w{@code{FS == " "}}).
+This feature can be a problem if you really don't want the
newline character to separate fields, since there is no way to
prevent it. However, you can work around this by using the @code{split}
function to break up the record manually (@pxref{String Functions}).
@@ -1893,7 +2281,7 @@ So far we have been getting our input files from @code{awk}'s main
input stream---either the standard input (usually your terminal) or the
files specified on the command line. The @code{awk} language has a
special built-in command called @code{getline} that
-can be used to read input under your explicit control.
+can be used to read input under your explicit control.@refill
This command is quite complex and should @emph{not} be used by
beginners. It is covered here because this is the chapter on input.
@@ -1921,17 +2309,17 @@ example:@refill
@example
awk '@{
if (t = index($0, "/*")) @{
- if(t > 1)
+ if (t > 1)
tmp = substr($0, 1, t - 1)
else
tmp = ""
u = index(substr($0, t + 2), "*/")
- while (! u) @{
+ while (u == 0) @{
getline
t = -1
u = index($0, "*/")
@}
- if(u <= length($0) - 2)
+ if (u <= length($0) - 2)
$0 = tmp substr($0, t + u + 3)
else
$0 = tmp
@@ -1940,11 +2328,15 @@ awk '@{
@}'
@end example
-This @code{awk} program deletes all comments, @samp{/* @dots{}
+This @code{awk} program deletes all C-style comments, @samp{/* @dots{}
*/}, from the input. By replacing the @samp{print $0} with other
statements, you could perform more complicated processing on the
-decommented input, such as searching it for matches for a regular
-expression.
+decommented input, such as searching it for matches of a regular
+expression. (This program has a subtle problem---can you spot it?)
+
+@c the program to remove comments doesn't work if one
+@c comment ends and another begins on the same line. (Your
+@c idea for restart would be useful here). --- brennan@boeing.com
This form of the @code{getline} command sets @code{NF} (the number of
fields; @pxref{Fields}), @code{NR} (the number of records read so far;
@@ -2023,11 +2415,11 @@ file.@refill
@example
awk '@{
-if ($1 == 10) @{
- getline < "foo.input"
- print
-@} else
- print
+ if ($1 == 10) @{
+ getline < "foo.input"
+ print
+ @} else
+ print
@}'
@end example
@@ -2045,7 +2437,7 @@ the new record is tested against any subsequent rules, just as when
@item getline @var{var} < @var{file}
This form of the @code{getline} function takes its input from the file
@var{file} and puts it in the variable @var{var}. As above, @var{file}
-is a string-valued expression that specifies the file to read from.
+is a string-valued expression that specifies the file from which to read.
In this version of @code{getline}, none of the built-in variables are
changed, and the record is not split into fields. The only variable
@@ -2069,11 +2461,11 @@ awk '@{
Note here how the name of the extra input file is not built into
the program; it is taken from the data, from the second field on
-the @samp{@@include} line.
+the @samp{@@include} line.@refill
The @code{close} function is called to ensure that if two identical
@samp{@@include} lines appear in the input, the entire specified file is
-included twice. @xref{Close Input}.
+included twice. @xref{Close Input}.@refill
One deficiency of this program is that it does not process nested
@samp{@@include} statements the way a true macro preprocessor would.
@@ -2134,7 +2526,7 @@ bletch
@noindent
Notice that this program ran the command @code{who} and printed the result.
(If you try this program yourself, you will get different results, showing
-you logged in.)
+you who is logged in on your system.)
This variation of @code{getline} splits the record into fields, sets the
value of @code{NF} and recomputes the value of @code{$0}. The values of
@@ -2144,7 +2536,7 @@ value of @code{NF} and recomputes the value of @code{$0}. The values of
The output of the command @var{command} is sent through a pipe to
@code{getline} and into the variable @var{var}. For example, the
following program reads the current date and time into the variable
-@code{current_time}, using the utility called @code{date}, and then
+@code{current_time}, using the @code{date} utility, and then
prints it.@refill
@group
@@ -2174,9 +2566,9 @@ The next time the same file or command is used in @code{getline}, another
record is read from it, and so on.
This implies that if you want to start reading the same file again from
-the beginning, or if you want to rerun a shell command (rather that
+the beginning, or if you want to rerun a shell command (rather than
reading more output from the command), you must take special steps.
-What you can do is use the @code{close} function, as follows:
+What you must do is use the @code{close} function, as follows:
@example
close(@var{filename})
@@ -2221,6 +2613,7 @@ Both are described in this chapter.
* Print:: The @code{print} statement.
* Print Examples:: Simple examples of @code{print} statements.
* Output Separators:: The output separators and how to change them.
+* OFMT:: Controlling Numeric Output With @code{print}.
* Printf:: The @code{printf} statement.
* Redirection:: How to redirect output to multiple files and pipes.
* Special Files:: File name interpretation in @code{gawk}. @code{gawk}
@@ -2251,11 +2644,12 @@ relational operator; otherwise it could be confused with a redirection
The items printed can be constant strings or numbers, fields of the
current record (such as @code{$1}), variables, or any @code{awk}
expressions. The @code{print} statement is completely general for
-computing @emph{what} values to print. With one exception
-(@pxref{Output Separators}), what you can't do is specify @emph{how} to
-print them---how many columns to use, whether to use exponential
-notation or not, and so on. For that, you need the @code{printf}
-statement (@pxref{Printf}).
+computing @emph{what} values to print. With two exceptions
+(@pxref{Output Separators},
+and @pxref{OFMT}), what you can't do is
+specify @emph{how} to print them---how many columns to use, whether to
+use exponential notation or not, and so on. For that, you need the
+@code{printf} statement (@pxref{Printf}).@refill
The simple statement @samp{print} with no items is equivalent to
@samp{print $0}: it prints the entire current record. To print a blank
@@ -2332,9 +2726,8 @@ Neither example's output makes much sense to someone unfamiliar with the
file @file{inventory-shipped}. A heading line at the beginning would make
it clearer. Let's add some headings to our table of months (@code{$1}) and
green crates shipped (@code{$2}). We do this using the @code{BEGIN} pattern
-(@pxref{BEGIN/END}) to cause the headings to be printed only once:
+(@pxref{BEGIN/END}) to force the headings to be printed only once:
-@c the formatting is strange here because the @{ becomes just a brace.
@example
awk 'BEGIN @{ print "Month Crates"
print "----- ------" @}
@@ -2370,9 +2763,9 @@ complicated when you have many columns to fix. Counting spaces for two
or three columns can be simple, but more than this and you can get
``lost'' quite easily. This is why the @code{printf} statement was
created (@pxref{Printf}); one of its specialties is lining up columns of
-data.
+data.@refill
-@node Output Separators, Printf, Print Examples, Printing
+@node Output Separators, OFMT, Print Examples, Printing
@section Output Separators
@cindex output field separator, @code{OFS}
@@ -2385,7 +2778,7 @@ separated by single spaces. But they do not have to be spaces; a
single space is only the default. You can specify any string of
characters to use as the @dfn{output field separator} by setting the
built-in variable @code{OFS}. The initial value of this variable
-is the string @w{@code{" "}}.
+is the string @w{@code{" "}}, that is, just a single space.@refill
The output from an entire @code{print} statement is called an
@dfn{output record}. Each @code{print} statement outputs one output
@@ -2414,7 +2807,33 @@ If the value of @code{ORS} does not contain a newline, all your output
will be run together on a single line, unless you output newlines some
other way.
-@node Printf, Redirection, Output Separators, Printing
+@node OFMT, Printf, Output Separators, Printing
+@section Controlling Numeric Output With @code{print}
+@vindex OFMT
+When you use the @code{print} statement to print numeric values,
+@code{awk} internally converts the number to a string of characters,
+and prints that string. @code{awk} uses the @code{sprintf} function
+to do this conversion. For now, it suffices to say that the @code{sprintf}
+function accepts a @dfn{format specification} that tells it how to format
+numbers (or strings), and that there are a number of different ways that
+numbers can be formatted. The different format specifications are discussed
+more fully in @ref{Printf}.@refill
+
+The built-in variable @code{OFMT} contains the default format specification
+that @code{print} uses with @code{sprintf} when it wants to convert a
+number to a string for printing. By supplying different format specifications
+as the value of @code{OFMT}, you can change how @code{print} will print
+your numbers. As a brief example:
+
+@example
+awk 'BEGIN @{ OFMT = "%d" # print numbers as integers
+ print 17.23 @}'
+@end example
+
+@noindent
+will print @samp{17}.
+
+@node Printf, Redirection, OFMT, Printing
@section Using @code{printf} Statements For Fancier Printing
@cindex formatted output
@cindex output, formatted
@@ -2455,21 +2874,21 @@ relational operator; otherwise it could be confused with a redirection
@cindex format string
The difference between @code{printf} and @code{print} is the argument
-@var{format}. This is an expression whose value is taken as a string; its
-job is to say how to output each of the other arguments. It is called
+@var{format}. This is an expression whose value is taken as a string; it
+specifies how to output each of the other arguments. It is called
the @dfn{format string}.
-The format string is essentially the same as in the C library function
+The format string is the same as in the @sc{ansi} C library function
@code{printf}. Most of @var{format} is text to be output verbatim.
Scattered among this text are @dfn{format specifiers}, one per item.
Each format specifier says to output the next item at that place in the
format.@refill
The @code{printf} statement does not automatically append a newline to its
-output. It outputs nothing but what the format specifies. So if you want
+output. It outputs only what the format specifies. So if you want
a newline, you must include one in the format. The output separator
variables @code{OFS} and @code{ORS} have no effect on @code{printf}
-statements.
+statements.@refill
@node Control Letters, Format Modifiers, Basic Printf, Printf
@subsection Format-Control Letters
@@ -2481,7 +2900,7 @@ A format specifier starts with the character @samp{%} and ends with a
to output one item. (If you actually want to output a @samp{%}, write
@samp{%%}.) The format-control letter specifies what kind of value to
print. The rest of the format specifier is made up of optional
-@dfn{modifiers} which are parameters such as the field width to use.
+@dfn{modifiers} which are parameters such as the field width to use.@refill
Here is a list of the format-control letters:
@@ -2514,8 +2933,27 @@ discussed below.
This prints a number in floating point notation.
@item g
-This prints either scientific notation or floating point notation, whichever
-is shorter.
+This prints a number in either scientific notation or floating point
+notation, whichever uses fewer characters.
+@ignore
+From: gatech!ames!elroy!cit-vax!EQL.Caltech.Edu!rankin (Pat Rankin)
+
+In the description of printf formats (p.43), the information for %g
+is incorrect (mainly, it's too much of an oversimplification). It's
+wrong in the AWK book too, and in the gawk man page. I suggested to
+David Trueman before 2.13 was released that the latter be revised, so
+that it matched gawk's behavior (rather than trying to change gawk to
+match the docs ;-). The documented description is nice and simple, but
+it doesn't match the actual underlying behavior of %g in the various C
+run-time libraries that gawk relies on. The precision value for g format
+is different than for f and e formats, so it's inaccurate to say 'g' is
+the shorter of 'e' or 'f'. For 'g', precision represents the number of
+significant digits rather than the number of decimal places, and it has
+special rules about how to format numbers with range between 10E-1 and
+10E-4. All in all, it's pretty messy, and I had to add that clumsy
+GFMT_WORKAROUND code because the VMS run-time library doesn't conform to
+the ANSI-C specifications.
+@end ignore
@item o
This prints an unsigned octal integer.
@@ -2582,8 +3020,10 @@ printf "%4s", "foobar"
@end example
@noindent
-prints @samp{foobar}. Preceding the @var{width} with a minus sign causes
-the output to be padded with spaces on the right, instead of on the left.
+prints @samp{foobar}.
+
+Preceding the @var{width} with a minus sign causes the output to be
+padded with spaces on the right, instead of on the left.
@item .@var{prec}
This is a number that specifies the precision to use when printing.
@@ -2593,9 +3033,43 @@ characters from the string that should be printed.
@end table
The C library @code{printf}'s dynamic @var{width} and @var{prec}
-capability (for example, @code{"%*.*s"}) is not yet supported. However, it can
-easily be simulated using concatenation to dynamically build the
-format string.@refill
+capability (for example, @code{"%*.*s"}) is supported. Instead of
+supplying explicit @var{width} and/or @var{prec} values in the format
+string, you pass them in the argument list. For example:@refill
+
+@example
+w = 5
+p = 3
+s = "abcdefg"
+printf "<%*.*s>\n", w, p, s
+@end example
+
+@noindent
+is exactly equivalent to
+
+@example
+s = "abcdefg"
+printf "<%5.3s>\n", s
+@end example
+
+@noindent
+Both programs output @samp{@w{<@bullet{}@bullet{}abc>}}. (We have
+used the bullet symbol ``@bullet{}'' to represent a space, to clearly
+show you that there are two spaces in the output.)@refill
+
+Earlier versions of @code{awk} did not support this capability. You may
+simulate it by using concatenation to build up the format string,
+like so:@refill
+
+@example
+w = 5
+p = 3
+s = "abcdefg"
+printf "<%" w "." p "s>\n", s
+@end example
+
+@noindent
+This is not particularly easy to read, however.
@node Printf Examples, , Format Modifiers, Printf
@subsection Examples of Using @code{printf}
@@ -2610,7 +3084,7 @@ awk '@{ printf "%-10s %s\n", $1, $2 @}' BBS-list
prints the names of bulletin boards (@code{$1}) of the file
@file{BBS-list} as a string of 10 characters, left justified. It also
prints the phone numbers (@code{$2}) afterward on the line. This
-produces an aligned two-column table of names and phone numbers:
+produces an aligned two-column table of names and phone numbers:@refill
@example
aardvark 555-5553
@@ -2637,7 +3111,7 @@ last things on their lines. We don't need to put spaces after them.
We could make our table look even nicer by adding headings to the tops
of the columns. To do this, use the @code{BEGIN} pattern
-(@pxref{BEGIN/END}) to cause the header to be printed only once, at the
+(@pxref{BEGIN/END}) to force the header to be printed only once, at the
beginning of the @code{awk} program:
@example
@@ -2681,9 +3155,9 @@ section on the @code{print} statement (@pxref{Print}).
@cindex output redirection
@cindex redirection of output
So far we have been dealing only with output that prints to the standard
-output, usually your terminal. Both @code{print} and @code{printf} can be
-told to send their output to other places. This is called
-@dfn{redirection}.@refill
+output, usually your terminal. Both @code{print} and @code{printf} can
+also send their output to other places.
+This is called @dfn{redirection}.@refill
A redirection appears after the @code{print} or @code{printf} statement.
Redirections in @code{awk} are written just like redirections in shell
@@ -2699,7 +3173,7 @@ commands, except that they are written inside the @code{awk} program.
Here are the three forms of output redirection. They are all shown for
the @code{print} statement, but they work identically for @code{printf}
-also.
+also.@refill
@table @code
@item print @var{items} > @var{output-file}
@@ -2770,7 +3244,8 @@ the pipe as soon as all the intended output has been sent to it.
Redirecting output using @samp{>}, @samp{>>}, or @samp{|} asks the system
to open a file or pipe only if the particular @var{file} or @var{command}
-you've specified has not already been written to by your program.@refill
+you've specified has not already been written to by your program, or if
+it has been closed since it was last written to.@refill
@node Close Output, , File/Pipe Redirection, Redirection
@subsection Closing Output Files and Pipes
@@ -2821,7 +3296,7 @@ you can start reading it with @code{getline} (@pxref{Getline}).
@item
To write numerous files, successively, in the same @code{awk}
-program. If you don't close the files, eventually you will exceed the
+program. If you don't close the files, eventually you may exceed a
system limit on the number of open files in one process. So close
each one when you are finished writing it.
@@ -2860,7 +3335,9 @@ is used only for writing error messages; the reason we have two separate
streams, standard output and standard error, is so that they can be
redirected separately.
-@c @cindex differences between @code{gawk} and @code{awk}
+@iftex
+@cindex differences between @code{gawk} and @code{awk}
+@end iftex
In other implementations of @code{awk}, the only way to write an error
message to standard error in an @code{awk} program is as follows:
@@ -2931,6 +3408,15 @@ NF != 4 @{
Recognition of these special file names is disabled if @code{gawk} is in
compatibility mode (@pxref{Command Line}).
+@strong{Note}: Unless your system actually has a @file{/dev/fd} directory,
+the interpretation of these file names is done by @code{gawk} itself.
+For example, using @samp{/dev/fd/4} for output will actually write on
+file descriptor 4, and not on a new file descriptor that was @code{dup}'ed
+from file descriptor 4. Most of the time this does not matter; however, it
+is important to @emph{not} close any of the files related to file descriptors
+0, 1, and 2. If you do close one of these files, unpredictable behavior
+will result.
+
@node One-liners, Patterns, Printing, Top
@chapter Useful ``One-liners''
@@ -2942,9 +3428,9 @@ of the program will give you a good idea of what is going on, but please
read the rest of the manual to become an @code{awk} expert!
@table @code
-@item awk '@{ num_fields = num_fields + NF @}
-@itemx @ @ @ @ @ END @{ print num_fields @}'
-This program prints the total number of fields in all input lines.
+@item awk '@{ if (NF > max) max = NF @}
+@itemx @ @ @ @ @ END @{ print max @}'
+This program prints the maximum number of fields on any input line.
@item awk 'length($0) > 80'
This program prints every line longer than 80 characters. The sole
@@ -2974,6 +3460,21 @@ This program prints the total number of bytes used by @var{files}.
This program prints the maximum line length of @var{file}. The input
is piped through the @code{expand} program to change tabs into spaces,
so the widths compared are actually the right-margin columns.
+
+@item awk 'BEGIN @{ FS = ":" @}
+@itemx @ @ @ @ @ @{ print $1 | "sort" @}' /etc/passwd
+This program prints a sorted list of the login names of all users.
+
+@item awk '@{ nlines++ @}
+@itemx @ @ @ @ @ END@ @{ print nlines @}'
+This program counts lines in a file.
+
+@item awk 'END @{ print NR @}'
+This program also counts lines in a file, but lets @code{awk} do the work.
+
+@item awk '@{ print NR, $0 @}'
+This program concatenates and line numbers all its input files,
+similar to @samp{cat -n}.
@end table
@node Patterns, Actions, One-liners, Top
@@ -3012,13 +3513,13 @@ Here is a summary of the types of patterns supported in @code{awk}.
@table @code
@item /@var{regular expression}/
A regular expression as a pattern. It matches when the text of the
-input record fits the regular expression. (@xref{Regexp, , Regular
-Expressions as Patterns}.)
+input record fits the regular expression.
+(@xref{Regexp, , Regular Expressions as Patterns}.)@refill
@item @var{expression}
A single expression. It matches when its value, converted to a number,
-is nonzero (if a number) or nonnull (if a string). (@xref{Expression
-Patterns}.)
+is nonzero (if a number) or nonnull (if a string).
+(@xref{Expression Patterns}.)@refill
@item @var{pat1}, @var{pat2}
A pair of patterns separated by a comma, specifying a range of records.
@@ -3030,8 +3531,8 @@ Special patterns to supply start-up or clean-up information to
@code{awk}. (@xref{BEGIN/END}.)
@item @var{null}
-The empty pattern matches every input record. (@xref{Empty, , The Empty
-Pattern}.)
+The empty pattern matches every input record.
+(@xref{Empty, , The Empty Pattern}.)@refill
@end table
@node Empty, Regexp, Kinds of Patterns, Patterns
@@ -3047,7 +3548,7 @@ awk '@{ print $1 @}' BBS-list
@end example
@noindent
-prints just the first field of every record.
+prints the first field of every record.
@node Regexp, Comparison Patterns, Empty, Patterns
@section Regular Expressions as Patterns
@@ -3077,10 +3578,10 @@ classes of strings.
@subsection How to Use Regular Expressions
A regular expression can be used as a pattern by enclosing it in
-slashes. Then the regular expression is matched against the entire text
-of each record. (Normally, it only needs to match some part of the text
-in order to succeed.) For example, this prints the second field of each
-record that contains @samp{foo} anywhere:
+slashes. Then the regular expression is matched against the
+entire text of each record. (Normally, it only needs
+to match some part of the text in order to succeed.) For example, this
+prints the second field of each record that contains @samp{foo} anywhere:
@example
awk '/foo/ @{ print $2 @}' BBS-list
@@ -3094,7 +3595,7 @@ awk '/foo/ @{ print $2 @}' BBS-list
Regular expressions can also be used in comparison expressions. Then
you can specify the string to match against; it need not be the entire
current input record. These comparison expressions can be used as
-patterns or in @code{if} and @code{while} statements.
+patterns or in @code{if}, @code{while}, @code{for}, and @code{do} statements.
@table @code
@item @var{exp} ~ /@var{regexp}/
@@ -3201,7 +3702,7 @@ characters that are enclosed in the square brackets. For example:
@end example
@noindent
-matches any of the characters @samp{M}, @samp{V}, or @samp{X} in a
+matches any one of the characters @samp{M}, @samp{V}, or @samp{X} in a
string.@refill
Ranges of characters are indicated by using a hyphen between the beginning
@@ -3223,29 +3724,25 @@ character set, put a @samp{\} in front of it. For example:
@end example
@noindent
-matches either @samp{]}, or @samp{d}.@refill
+matches either @samp{d}, or @samp{]}.@refill
This treatment of @samp{\} is compatible with other @code{awk}
-implementations but incompatible with the proposed POSIX specification
-for @code{awk}. The current draft specifies the use of the same syntax
-used in @code{egrep}.
-
-We may change @code{gawk} to fit the standard, once we are sure it will
-no longer change. For the meanwhile, the @samp{-a} option specifies the
-traditional @code{awk} syntax described above (which is also the
-default), while the @samp{-e} option specifies @code{egrep} syntax.
-@xref{Options}.
+implementations, and is also mandated by the @sc{POSIX} Command Language
+and Utilities standard. The regular expressions in @code{awk} are a superset
+of the @sc{POSIX} specification for Extended Regular Expressions (EREs).
+@sc{POSIX} EREs are based on the regular expressions accepted by the
+traditional @code{egrep} utility.
In @code{egrep} syntax, backslash is not syntactically special within
square brackets. This means that special tricks have to be used to
represent the characters @samp{]}, @samp{-} and @samp{^} as members of a
character set.
-To match @samp{-}, write it as @samp{---}, which is a range containing
-only @samp{-}. You may also give @samp{-} as the first or last
-character in the set. To match @samp{^}, put it anywhere except as the
-first character of a set. To match a @samp{]}, make it the first
-character in the set. For example:
+In @code{egrep} syntax, to match @samp{-}, write it as @samp{---},
+which is a range containing only @samp{-}. You may also give @samp{-}
+as the first or last character in the set. To match @samp{^}, put it
+anywhere except as the first character of a set. To match a @samp{]},
+make it the first character in the set. For example:@refill
@example
[]d^]
@@ -3257,7 +3754,7 @@ matches either @samp{]}, @samp{d} or @samp{^}.@refill
@item [^ @dots{}]
This is a @dfn{complemented character set}. The first character after
the @samp{[} @emph{must} be a @samp{^}. It matches any characters
-@emph{except} those in the square brackets. For example:
+@emph{except} those in the square brackets (or newline). For example:
@example
[^0-9]
@@ -3335,7 +3832,7 @@ fe?d
@end example
@noindent
-will match @samp{fed} or @samp{fd}, but nothing else.@refill
+will match @samp{fed} and @samp{fd}, but nothing else.@refill
@item \
This is used to suppress the special meaning of a character when
@@ -3402,7 +3899,7 @@ case-insensitive and other rules case-sensitive, because there is no way
to set @code{IGNORECASE} just for the pattern of a particular rule. To
do this, you must use character sets or @code{tolower}. However, one
thing you can do only with @code{IGNORECASE} is turn case-sensitivity on
-or off dynamically for all the rules at once.
+or off dynamically for all the rules at once.@refill
@code{IGNORECASE} can be set on the command line, or in a @code{BEGIN}
rule. Setting @code{IGNORECASE} from the command line is a way to make
@@ -3453,11 +3950,12 @@ True if @var{x} does not match the regular expression described by @var{y}.
The operands of a relational operator are compared as numbers if they
are both numbers. Otherwise they are converted to, and compared as,
-strings (@pxref{Conversion}). Strings are compared by comparing the
-first character of each, then the second character of each, and so on,
-until there is a difference. If the two strings are equal until the
-shorter one runs out, the shorter one is considered to be less than the
-longer one. Thus, @code{"10"} is less than @code{"9"}.
+strings (@pxref{Conversion}, for the detailed rules). Strings are compared
+by comparing the first character of each, then the second character of each,
+and so on, until there is a difference. If the two strings are equal until
+the shorter one runs out, the shorter one is considered to be less than the
+longer one. Thus, @code{"10"} is less than @code{"9"}, and @code{"abc"}
+is less than @code{"abcd"}.@refill
The left operand of the @samp{~} and @samp{!~} operators is a string.
The right operand is either a constant regular expression enclosed in
@@ -3524,7 +4022,7 @@ boolean operators. @xref{Boolean Ops}, for complete information on
the boolean operators.
The subpatterns of a boolean pattern can be constant regular
-expressions, comparisons, or any other @code{gawk} expressions. Range
+expressions, comparisons, or any other @code{awk} expressions. Range
patterns are not expressions, so they cannot appear inside boolean
patterns. Likewise, the special patterns @code{BEGIN} and @code{END},
which never match any input record, are not expressions and cannot
@@ -3533,7 +4031,7 @@ appear inside boolean patterns.
@node Expression Patterns, Ranges, Boolean Patterns, Patterns
@section Expressions as Patterns
-Any @code{awk} expression is valid also as a pattern in @code{gawk}.
+Any @code{awk} expression is also valid as an @code{awk} pattern.
Then the pattern ``matches'' if the expression's value is nonzero (if a
number) or nonnull (if a string).
@@ -3555,10 +4053,10 @@ pattern. @code{/foo/} as an expression has the value 1 if @samp{foo}
appears in the current input record; thus, as a pattern, @code{/foo/}
matches any record containing @samp{foo}.
-Other implementations of @code{awk} are less general than @code{gawk}:
-they allow comparison expressions, and boolean combinations thereof
-(optionally with parentheses), but not necessarily other kinds of
-expressions.
+Other implementations of @code{awk} that are not yet @sc{POSIX} compliant
+are less general than @code{gawk}: they allow comparison expressions, and
+boolean combinations thereof (optionally with parentheses), but not
+necessarily other kinds of expressions.
@node Ranges, BEGIN/END, Expression Patterns, Patterns
@section Specifying Record Ranges With Patterns
@@ -3618,8 +4116,8 @@ awk 'BEGIN @{ print "Analysis of `foo'" @}
@end example
@end group
-This program finds out how many times the string @samp{foo} appears in
-the input file @file{BBS-list}. The @code{BEGIN} rule prints a title
+This program finds the number of records in the input file @file{BBS-list}
+that contain the string @samp{foo}. The @code{BEGIN} rule prints a title
for the report. There is no need to use the @code{BEGIN} rule to
initialize the counter @code{foobar} to zero, as @code{awk} does this
for us automatically (@pxref{Variables}).
@@ -3629,7 +4127,7 @@ record containing the pattern @samp{foo} is read. The @code{END} rule
prints the value of @code{foobar} at the end of the run.@refill
The special patterns @code{BEGIN} and @code{END} cannot be used in ranges
-or with boolean operators.
+or with boolean operators (indeed, they cannot be used with any operators).
An @code{awk} program may have multiple @code{BEGIN} and/or @code{END}
rules. They are executed in the order they appear, all the @code{BEGIN}
@@ -3641,7 +4139,7 @@ library functions, since each library can have its own @code{BEGIN} or
the order in which library functions are named on the command line
controls the order in which their @code{BEGIN} and @code{END} rules are
executed. Therefore you have to be careful to write such rules in
-library files so that it doesn't matter what order they are executed in.
+library files so that the order in which they are executed doesn't matter.
@xref{Command Line}, for more information on using library functions.
If an @code{awk} program only has a @code{BEGIN} rule, and no other
@@ -3656,7 +4154,7 @@ the program. This is necessary in case the @code{END} rule checks the
action for these rules since there is no current record when they run.
@node Actions, Expressions, Patterns, Top
-@chapter Actions: Overview
+@chapter Overview of Actions
@cindex action, definition of
@cindex curly braces
@cindex action, curly braces
@@ -3689,7 +4187,7 @@ contains only one statement, or even if it contains no statements at
all. However, if you omit the action entirely, omit the curly braces as
well. (An omitted action is equivalent to @samp{@{ print $0 @}}.)
-Here are the kinds of statement supported in @code{awk}:
+Here are the kinds of statements supported in @code{awk}:
@itemize @bullet
@item
@@ -3711,7 +4209,7 @@ statements together in the body of an @code{if}, @code{while}, @code{do}
or @code{for} statement.
@item
-Input control, using the @code{getline} function (@pxref{Getline}),
+Input control, using the @code{getline} command (@pxref{Getline}),
and the @code{next} statement (@pxref{Next Statement}).
@item
@@ -3729,7 +4227,7 @@ to discuss how to define your own functions.
@end iftex
@node Expressions, Statements, Actions, Top
-@chapter Actions: Expressions
+@chapter Expressions as Action Statements
@cindex expression
Expressions are the basic building block of @code{awk} actions. An
@@ -3740,7 +4238,7 @@ But, beyond that, an expression can assign a new value to a variable
or a field, with an assignment operator.
An expression can serve as a statement on its own. Most other kinds of
-statement contain one or more expressions which specify data to be
+statements contain one or more expressions which specify data to be
operated on. As in other languages, expressions in @code{awk} include
variables, array references, constants, and function calls, as well as
combinations of these with various operators.
@@ -3770,7 +4268,7 @@ combinations of these with various operators.
@cindex string constants
The simplest type of expression is the @dfn{constant}, which always has
-the same value. There are three types of constant: numeric constants,
+the same value. There are three types of constants: numeric constants,
string constants, and regular expression constants.
@cindex numeric constant
@@ -3795,7 +4293,9 @@ double-quote marks. For example:
@end example
@noindent
-@c @cindex differences between @code{gawk} and @code{awk}
+@iftex
+@cindex differences between @code{gawk} and @code{awk}
+@end iftex
represents the string whose contents are @samp{parrot}. Strings in
@code{gawk} can be of any length and they can contain all the possible
8-bit ASCII characters including ASCII NUL. Other @code{awk}
@@ -3853,20 +4353,92 @@ digits between 0 and 7. For example, the code for the ASCII ESC
@item \x@var{hh@dots{}}
Represents the hexadecimal value @var{hh}, where @var{hh} are hexadecimal
digits (@samp{0} through @samp{9} and either @samp{A} through @samp{F} or
-@samp{a} through @samp{f}). Like the same construct in ANSI C, the escape
+@samp{a} through @samp{f}). Like the same construct in @sc{ANSI} C, the escape
sequence continues until the first non-hexadecimal digit is seen. However,
-using more than two hexadecimal digits produces undefined results.@refill
+using more than two hexadecimal digits produces undefined results. (The
+@samp{\x} escape sequence is not allowed in @sc{POSIX} @code{awk}.)@refill
@end table
A constant regexp is a regular expression description enclosed in
slashes, such as @code{/^beginning and end$/}. Most regexps used in
@code{awk} programs are constant, but the @samp{~} and @samp{!~}
-operators can also match computed or ``dynamic'' regexps (@pxref{Regexp
-Usage}).
+operators can also match computed or ``dynamic'' regexps (@pxref{Regexp Usage}).
+
+Constant regexps may be used like simple expressions. When a
+constant regexp is not on the right hand side of the @samp{~} or
+@samp{!~} operators, it has the same meaning as if it appeared
+in a pattern, i.e. @samp{($0 ~ /foo/)} (@pxref{Expression Patterns}).
+This means that the following two code segments:@refill
+
+@example
+if ($0 ~ /barfly/ || $0 ~ /camelot/)
+ print "found"
+@end example
+
+@noindent
+and
+
+@example
+if (/barfly/ || /camelot/)
+ print "found"
+@end example
+
+@noindent
+are exactly equivalent. One rather bizarre consequence of this rule is
+that the following boolean expression is legal, but does not do what the user
+intended:@refill
+
+@example
+if (/foo/ ~ $1) print "found foo"
+@end example
+
+This code is ``obviously'' testing @code{$1} for a match against the regexp
+@code{/foo/}. But in fact, the expression @code{(/foo/ ~ $1)} actually means
+@code{(($0 ~ /foo/) ~ $1)}. In other words, first match the input record
+against the regexp @code{/foo/}. The result will be either a 0 or a 1,
+depending upon the success or failure of the match. Then match that result
+against the first field in the record.@refill
+
+Since it is unlikely that you would ever really wish to make this kind of
+test, @code{gawk} will issue a warning when it sees this construct in
+a program.@refill
+
+Another consequence of this rule is that the assignment statement
+
+@example
+matches = /foo/
+@end example
+
+@noindent
+will assign either 0 or 1 to the variable @code{matches}, depending
+upon the contents of the current input record.
-Constant regexps are useful only with the @samp{~} and @samp{!~} operators;
-you cannot assign them to variables or print them. They are not truly
-expressions in the usual sense.
+Constant regular expressions are also used as the first argument for
+the @code{sub} and @code{gsub} functions (@pxref{String Functions}).
+
+This feature of the language was never well documented until the
+@sc{POSIX} specification.
+
+You may be wondering, when is
+
+@example
+$1 ~ /foo/ @{ @dots{} @}
+@end example
+
+@noindent
+preferable to
+
+@example
+$1 ~ "foo" @{ @dots{} @}
+@end example
+
+Since the right-hand sides of both @samp{~} operators are constants,
+it is more efficient to use the @samp{/foo/} form: @code{awk} can note
+that you have supplied a regexp and store it internally in a form that
+makes pattern matching more efficient. In the second form, @code{awk}
+must first convert the string into this internal form, and then perform
+the pattern matching. The first form is also better ``style;'' it is
+clear that you intend a regexp match.
@node Variables, Arithmetic Ops, Constants, Expressions
@section Variables
@@ -3928,7 +4500,7 @@ If you precede the assignment with the @samp{-v} option, like this:
@noindent
then the variable is set at the very beginning, before even the
@code{BEGIN} rules are run. The @samp{-v} option and its assignment
-must precede all the file name arguments.
+must precede all the file name arguments, as well as the program text.
Otherwise, the variable assignment is performed at a time determined by
its position among the input file arguments: after the processing of the
@@ -3947,8 +4519,11 @@ but before the second file is started, @code{n} is set to 2, so that the
second field is printed in lines from @file{BBS-list}.
Command line arguments are made available for explicit examination by
-the @code{awk} program in an array named @code{ARGV} (@pxref{Built-in
-Variables}).
+the @code{awk} program in an array named @code{ARGV}
+(@pxref{Built-in Variables}).@refill
+
+@code{awk} processes the values of command line assignments for escape
+sequences (@pxref{Constants}).
@node Arithmetic Ops, Concatenation, Variables, Expressions
@section Arithmetic Operators
@@ -3984,6 +4559,9 @@ Subtraction.
@item - @var{x}
Negation.
+@item + @var{x}
+Unary plus. No real effect on the expression.
+
@item @var{x} * @var{y}
Multiplication.
@@ -3993,17 +4571,19 @@ floating point, the result is not rounded to an integer: @code{3 / 4}
has the value 0.75.
@item @var{x} % @var{y}
-@c @cindex differences between @code{gawk} and @code{awk}
+@iftex
+@cindex differences between @code{gawk} and @code{awk}
+@end iftex
Remainder. The quotient is rounded toward zero to an integer,
multiplied by @var{y} and this result is subtracted from @var{x}.
-This operation is sometimes known as ``trunc-mod''. The following
+This operation is sometimes known as ``trunc-mod.'' The following
relation always holds:
@example
b * int(a / b) + (a % b) == a
@end example
-One undesirable effect of this definition of remainder is that
+One possibly undesirable effect of this definition of remainder is that
@code{@var{x} % @var{y}} is negative if @var{x} is negative. Thus,
@example
@@ -4017,7 +4597,8 @@ may be machine dependent.
@itemx @var{x} ** @var{y}
Exponentiation: @var{x} raised to the @var{y} power. @code{2 ^ 3} has
the value 8. The character sequence @samp{**} is equivalent to
-@samp{^}.
+@samp{^}. (The @sc{POSIX} standard only specifies the use of @samp{^}
+for exponentiation.)
@end table
@node Concatenation, Comparison Ops, Arithmetic Ops, Expressions
@@ -4127,12 +4708,64 @@ True if array @var{array} has an element with the subscript @var{subscript}.
Comparison expressions have the value 1 if true and 0 if false.
-The operands of a relational operator are compared as numbers if they
-are both numbers. Otherwise they are converted to, and compared as,
-strings (@pxref{Conversion}). Strings are compared by comparing the
-first character of each, then the second character of each, and so on.
-Thus, @code{"10"} is less than @code{"9"}.
+The rules @code{gawk} uses for performing comparisons are based on those
+in draft 11.1 of the @sc{POSIX} standard. The @sc{POSIX} standard introduced
+the concept of a @dfn{numeric string}, which is simply a string that looks
+like a number, for example, @code{@w{" +2"}}.
+
+@vindex CONVFMT
+When performing a relational operation, @code{gawk} considers the type of an
+operand to be the type it received on its last @emph{assignment}, rather
+than the type of its last @emph{use}. If one operand of a comparison is
+numeric, and the other operand is either numeric or a numeric string,
+then @code{gawk} does a numeric comparison. Otherwise, it does a string
+comparison. The numeric operand will be converted to a string using
+the value of @code{CONVFMT} (@pxref{Conversion}). Strings are compared
+by comparing the first character of each, then the second character of each,
+and so on. Thus @code{"10"} is less than @code{"9"}. If there are two
+strings where one is a prefix of the other, the shorter string is less than
+the longer one. Thus @code{"abc"} is less than @code{"abcd"}.@refill
+
+Here are some sample expressions, how @code{gawk} compares them, and what
+the result of the comparison is.
+
+@table @code
+@item 1.5 <= 2.0
+numeric comparison (true)
+
+@item "abc" >= "xyz"
+string comparison (false)
+
+@item 1.5 != " +2"
+numeric comparison (true)
+
+@item "1e2" < "3"
+string comparison (true)
+@item a = 2; b = "2"
+@itemx a == b
+numeric comparison (true)
+@end table
+
+It is important to note that the concept of ``numeric string'' applies
+only to constants in the @code{awk} program source. Input data is somewhat
+different. In reality, all input data to @code{awk} is character data
+(as opposed to binary data). However, @code{awk} interprets characters in
+the input data that look like numbers @emph{as numbers}, and not as numeric
+strings. Thus,
+
+@example
+echo 1e2 3 | awk '@{ print ($1 < $2) ? "true" : "false" @}'
+@end example
+
+@noindent
+prints @samp{false}.
+
+The purpose of the comparison rules and the use of numeric strings is
+to attempt to produce the behavior that is ``least surprising,'' while
+still ``doing the right thing.''
+
+String comparisons and regular expression comparisons are very different.
For example,
@example
@@ -4148,7 +4781,7 @@ $1 ~ /foo/
@end example
@noindent
-has the value 1 if the first field contains @samp{foo}.
+has the value 1 if the first field contains @samp{foo}, such as @samp{foobar}.
The right hand operand of the @samp{~} and @samp{!~} operators may be
either a constant regexp (@code{/@dots{}/}), or it may be an ordinary
@@ -4171,7 +4804,7 @@ regexp to avoid confusing the @code{gawk} parser. For example,
One special place where @code{/foo/} is @emph{not} an abbreviation for
@code{$0 ~ /foo/} is when it is the right-hand operand of @samp{~} or
-@samp{!~}!
+@samp{!~}! @xref{Constants}, where this is discussed in more detail.
@node Boolean Ops, Assignment Ops, Comparison Ops, Expressions
@section Boolean Expressions
@@ -4191,10 +4824,10 @@ parentheses to control nesting. The truth of the boolean expression is
computed by combining the truth values of the component expressions.
Boolean expressions can be used wherever comparison and matching
-expressions can be used. They can be used in @code{if} and @code{while}
-statements. They have numeric values (1 if true, 0 if false), which
-come into place if the result of the boolean expression is stored in a
-variable, or used in arithmetic.
+expressions can be used. They can be used in @code{if}, @code{while},
+@code{do}, and @code{for} statements. They have numeric values (1 if true,
+0 if false), which come into play if the result of the boolean expression
+is stored in a variable, or used in arithmetic.@refill
In addition, every boolean expression is also a valid boolean pattern, so
you can use it as a pattern to control the execution of rules.
@@ -4286,13 +4919,13 @@ makes itself felt through the alteration of the variable. We call this
a @dfn{side effect}.
@cindex lvalue
-The left-hand operand of an assignment need not be a variable
-(@pxref{Variables}); it can also be a field (@pxref{Changing Fields}) or
+The left-hand operand of an assignment need not be a variable (@pxref{Variables});
+it can also be a field (@pxref{Changing Fields}) or
an array element (@pxref{Arrays}). These are all called @dfn{lvalues},
which means they can appear on the left-hand side of an assignment operator.
The right-hand operand may be any expression; it produces the new value
which the assignment stores in the specified variable, field or array
-element.
+element.@refill
It is important to note that variables do @emph{not} have permanent types.
The type of a variable is simply the type of whatever value it happens
@@ -4374,8 +5007,25 @@ Sets @var{lvalue} to its remainder by @var{modulus}.
@item @var{lvalue} ^= @var{power}
@itemx @var{lvalue} **= @var{power}
Raises @var{lvalue} to the power @var{power}.
+(Only the @code{^=} operator is specified by @sc{POSIX}.)
@end table
+@ignore
+From: gatech!ames!elroy!cit-vax!EQL.Caltech.Edu!rankin (Pat Rankin)
+ In the discussion of assignment operators, it states that
+``foo += 5'' "is precisely equivalent to" ``foo = foo + 5'' (p.77). That
+may be true for simple variables, but it's not true for expressions with
+side effects, like array references. For proof, try
+ BEGIN {
+ foo[rand()] += 5; for (x in foo) print x, foo[x]
+ bar[rand()] = bar[rand()] + 5; for (x in bar) print x, bar[x]
+ }
+I suspect that the original statement is simply untrue--that '+=' is more
+efficient in all cases.
+
+ADR --- Try to add something about this here for the next go 'round.
+@end ignore
+
@node Increment Ops, Conversion, Assignment Ops, Expressions
@section Increment Operators
@@ -4397,7 +5047,7 @@ equivalent.
Writing the @samp{++} after the variable specifies post-increment. This
increments the variable value just the same; the difference is that the
value of the increment expression itself is the variable's @emph{old}
-value. Thus, if @code{foo} has value 4, then the expression @code{foo++}
+value. Thus, if @code{foo} has the value 4, then the expression @code{foo++}
has the value 4, but it changes the value of @code{foo} to 5.
The post-increment @code{foo++} is nearly equivalent to writing @code{(foo
@@ -4407,7 +5057,10 @@ not necessarily equal @code{foo}. But the difference is minute as
long as you stick to numbers that are fairly small (less than a trillion).
Any lvalue can be incremented. Fields and array elements are incremented
-just like variables.
+just like variables. (Use @samp{$(i++)} when you wish to do a field reference
+and a variable increment at the same time. The parentheses are necessary
+because of the precedence of the field reference operator, @samp{$}.)
+@c expert information in the last parenthetical remark
The decrement operator @samp{--} works just like @samp{++} except that
it subtracts 1 instead of adding. Like @samp{++}, it can be used before
@@ -4465,23 +5118,56 @@ Strings are converted to numbers by interpreting them as numerals:
Strings that can't be interpreted as valid numbers are converted to
zero.
-@vindex OFMT
+@vindex CONVFMT
The exact manner in which numbers are converted into strings is controlled
-by the @code{awk} built-in variable @code{OFMT} (@pxref{Built-in Variables}).
+by the @code{awk} built-in variable @code{CONVFMT} (@pxref{Built-in Variables}).
Numbers are converted using a special
-version of the @code{sprintf} function (@pxref{Built-in}) with @code{OFMT}
+version of the @code{sprintf} function (@pxref{Built-in}) with @code{CONVFMT}
as the format specifier.@refill
-@code{OFMT}'s default value is @code{"%.6g"}, which prints a value with
+@code{CONVFMT}'s default value is @code{"%.6g"}, which prints a value with
at most six significant digits. For some applications you will want to
change it to specify more precision. Double precision on most modern
machines gives you 16 or 17 decimal digits of precision.
-Strange results can happen if you set @code{OFMT} to a string that doesn't
+Strange results can happen if you set @code{CONVFMT} to a string that doesn't
tell @code{sprintf} how to format floating point numbers in a useful way.
For example, if you forget the @samp{%} in the format, all numbers will be
converted to the same constant string.@refill
+As a special case, if a number is an integer, then the result of converting
+it to a string is @emph{always} an integer, no matter what the value of
+@code{CONVFMT} may be. Given the following code fragment:
+
+@example
+CONVFMT = "%2.2f"
+a = 12
+b = a ""
+@end example
+
+@noindent
+@code{b} has the value @code{"12"}, not @code{"12.00"}.
+
+@ignore
+For the 2.14 version, describe the ``stickiness'' of conversions. Right now
+the manual assumes everywhere that variables are either numbers or strings;
+in fact both kinds of values may be valid. If both happen to be valid, a
+conversion isn't necessary and isn't done. Revising the manual to be
+consistent with this, though, is too big a job to tackle at the moment.
+@end ignore
+
+@vindex OFMT
+Prior to the @sc{POSIX} standard, @code{awk} specified that the value
+of @code{OFMT} was used for converting numbers to strings. @code{OFMT}
+specifies the output format to use when printing numbers with @code{print}.
+@code{CONVFMT} was introduced in order to separate the semantics of
+conversions from the semantics of printing. Both @code{CONVFMT} and
+@code{OFMT} have the same default value: @code{"%.6g"}. In the vast majority
+of cases, old @code{awk} programs will not change their behavior.
+However, this use of @code{OFMT} is something to keep in mind if you must
+port your program to other implementations of @code{awk}; we recommend
+that instead of changing your programs, you just port @code{gawk} itself!@refill
+
@node Conditional Exp, Function Calls, Conversion, Expressions
@section Conditional Expressions
@cindex conditional expression
@@ -4499,10 +5185,10 @@ The conditional expression looks the same as in the C language:
@noindent
There are three subexpressions. The first, @var{selector}, is always
-computed first. If it is ``true'' (not zero) then @var{if-true-exp} is
-computed next and its value becomes the value of the whole expression.
-Otherwise, @var{if-false-exp} is computed next and its value becomes the
-value of the whole expression.
+computed first. If it is ``true'' (not zero and not null) then
+@var{if-true-exp} is computed next and its value becomes the value of
+the whole expression. Otherwise, @var{if-false-exp} is computed next
+and its value becomes the value of the whole expression.@refill
For example, this expression produces the absolute value of @code{x}:
@@ -4534,7 +5220,7 @@ A @dfn{function} is a name for a particular calculation. Because it has
a name, you can ask for it by name at any point in the program. For
example, the function @code{sqrt} computes the square root of a number.
-A fixed set of functions are @dfn{built in}, which means they are
+A fixed set of functions are @dfn{built-in}, which means they are
available in every @code{awk} program. The @code{sqrt} function is one
of these. @xref{Built-in}, for a list of built-in functions and their
descriptions. In addition, you can define your own functions in the
@@ -4551,7 +5237,7 @@ there are no arguments, write just @samp{()} after the function name.
Here are some examples:
@example
-sqrt(x**2 + y**2) # @r{One argument}
+sqrt(x^2 + y^2) # @r{One argument}
atan2(y, x) # @r{Two arguments}
rand() # @r{No arguments}
@end example
@@ -4592,7 +5278,7 @@ awk '@{ print "The square root of", $1, "is", sqrt($1) @}'
@end example
@node Precedence,, Function Calls, Expressions
-@section Operator Precedence: How Operators Nest
+@section Operator Precedence (How Operators Nest)
@cindex precedence
@cindex operator precedence
@@ -4600,10 +5286,10 @@ awk '@{ print "The square root of", $1, "is", sqrt($1) @}'
different operators appear close by in one expression. For example,
@samp{*} has higher precedence than @samp{+}; thus, @code{a + b * c}
means to multiply @code{b} and @code{c}, and then add @code{a} to the
-product.
+product (i.e., @code{a + (b * c)}).
-You can overrule the precedence of the operators by writing parentheses
-yourself. You can think of the precedence rules as saying where the
+You can overrule the precedence of the operators by using parentheses.
+You can think of the precedence rules as saying where the
parentheses are assumed if you do not write parentheses yourself. In
fact, it is wise always to use parentheses whenever you have an unusual
combination of operators, because other people who read the program may
@@ -4613,17 +5299,17 @@ any such mistake.
When operators of equal precedence are used together, the leftmost
operator groups first, except for the assignment, conditional and
-and exponentiation operators, which group in the opposite order.
+exponentiation operators, which group in the opposite order.
Thus, @code{a - b + c} groups as @code{(a - b) + c};
-@code{a = b = c} groups as @code{a = (b = c)}.
+@code{a = b = c} groups as @code{a = (b = c)}.@refill
The precedence of prefix unary operators does not matter as long as only
unary operators are involved, because there is only one way to parse
them---innermost first. Thus, @code{$++i} means @code{$(++i)} and
@code{++$x} means @code{++($x)}. However, when another operator follows
the operand, then the precedence of the unary operators can matter.
-Thus, @code{$x**2} means @code{($x)**2}, but @code{-x**2} means
-@code{-(x**2)}, because @samp{-} has lower precedence than @samp{**}
+Thus, @code{$x^2} means @code{($x)^2}, but @code{-x^2} means
+@code{-(x^2)}, because @samp{-} has lower precedence than @samp{^}
while @samp{$} has higher precedence.
Here is a table of the operators of @code{awk}, in order of increasing
@@ -4633,6 +5319,7 @@ precedence:
@item assignment
@samp{=}, @samp{+=}, @samp{-=}, @samp{*=}, @samp{/=}, @samp{%=},
@samp{^=}, @samp{**=}. These operators group right-to-left.
+(The @samp{**=} operator is not specified by @sc{POSIX}.)
@item conditional
@samp{?:}. These operators group right-to-left.
@@ -4644,7 +5331,7 @@ precedence:
@samp{&&}.
@item array membership
-@code{in}.
+@samp{in}.
@item matching
@samp{~}, @samp{!~}.
@@ -4668,16 +5355,9 @@ redirection operator near another operator of lower precedence, without
parentheses. Such combinations, for example @samp{print foo > a ? b :
c}, result in syntax errors.
-@item concatentation
+@item concatenation
No special token is used to indicate concatenation.
The operands are simply written side by side.
-@c This is supposedly being fixed
-@ignore
-Concatenation has the same precedence as relational and redirection
-operators. These operators nest left to right. Thus, @code{4 5 > 6}
-concatenates first, yielding 1, while @code{6 < 4 5} compares first, and
-yields @code{"05"}.
-@end ignore
@item add, subtract
@samp{+}, @samp{-}.
@@ -4690,6 +5370,7 @@ yields @code{"05"}.
@item exponentiation
@samp{^}, @samp{**}. These operators group right-to-left.
+(The @samp{**} operator is not specified by @sc{POSIX}.)
@item increment, decrement
@samp{++}, @samp{--}.
@@ -4699,7 +5380,7 @@ yields @code{"05"}.
@end table
@node Statements, Arrays, Expressions, Top
-@chapter Actions: Control Statements
+@chapter Control Statements in Actions
@cindex control statement
@dfn{Control statements} such as @code{if}, @code{while}, and so on
@@ -4949,11 +5630,12 @@ for (i = 1; i <= 100; i *= 2)
print i
@end example
-Any of the three expressions in the parentheses following @code{for} may
+Any of the three expressions in the parentheses following the @code{for} may
be omitted if there is nothing to be done there. Thus, @w{@samp{for (;x
> 0;)}} is equivalent to @w{@samp{while (x > 0)}}. If the
@var{condition} is omitted, it is treated as @var{true}, effectively
-yielding an infinite loop.@refill
+yielding an @dfn{infinite loop} (i.e., a loop that will never
+terminate).@refill
In most cases, a @code{for} loop is an abbreviation for a @code{while}
loop, as shown here:
@@ -5079,6 +5761,18 @@ for (x in names)
print names[x]
@end example
+@ignore
+from brennan@boeing.com:
+
+page 90, section 9.6. The example is too artificial as
+the one line program
+
+ !/ignore/
+
+does the same thing.
+@end ignore
+@c ADR --- he's right, but don't worry about this for now
+
The @code{continue} statement in a @code{for} loop directs @code{awk} to
skip the rest of the body of the loop, and resume execution with the
increment-expression of the @code{for} statement. The following program
@@ -5131,7 +5825,7 @@ Contrast this with the effect of the @code{getline} function
immediately, but it does not alter the flow of control in any way. So
the rest of the current action executes with a new input record.
-At the grossest level, @code{awk} program execution is a loop that reads
+At the highest level, @code{awk} program execution is a loop that reads
an input record and then tests each rule's pattern against it. If you
think of this loop as a @code{for} statement whose body contains the
rules, then the @code{next} statement is analogous to a @code{continue}
@@ -5154,8 +5848,13 @@ so that the following rules will not see the bad record. The error
message is redirected to the standard error output stream, as error
messages should be. @xref{Special Files}.
-The @code{next} statement is not allowed in a @code{BEGIN} or @code{END}
-rule.
+According to the @sc{POSIX} standard, the behavior is undefined if
+the @code{next} statement is used in a @code{BEGIN} or @code{END} rule.
+@code{gawk} will treat it as a syntax error.
+
+If the @code{next} statement causes the end of the input to be reached,
+then the code in the @code{END} rules, if any, will be executed.
+@xref{BEGIN/END}.
@node Exit Statement, , Next Statement, Statements
@section The @code{exit} Statement
@@ -5173,7 +5872,7 @@ read. However, if an @code{END} rule is present, it is executed
If @code{exit} is used as part of an @code{END} rule, it causes
the program to stop immediately.
-An @code{exit} statement that is part an ordinary rule (that is, not part
+An @code{exit} statement that is part of an ordinary rule (that is, not part
of a @code{BEGIN} or @code{END} rule) stops the execution of any further
automatic rules, but the @code{END} rule is executed if there is one.
If you don't want the @code{END} rule to do its job in this case, you
@@ -5202,7 +5901,7 @@ BEGIN @{
@node Arrays, Built-in, Statements, Top
@chapter Arrays in @code{awk}
-An @dfn{array} is a table of various values, called @dfn{elements}. The
+An @dfn{array} is a table of values, called @dfn{elements}. The
elements of an array are distinguished by their @dfn{indices}. Indices
may be either numbers or strings. Each array has a name, which looks
like a variable name, but must not be in use as a variable name in the
@@ -5219,6 +5918,8 @@ same @code{awk} program.
* Delete:: The @code{delete} statement removes an element from an array.
+* Numeric Array Subscripts:: How to use numbers as subscripts in @code{awk}.
+
* Multi-dimensional:: Emulating multi-dimensional arrays in @code{awk}.
* Multi-scanning:: Scanning multi-dimensional arrays.
@end menu
@@ -5238,11 +5939,11 @@ as a variable) in one @code{awk} program.
Arrays in @code{awk} superficially resemble arrays in other programming
languages; but there are fundamental differences. In @code{awk}, you
don't need to specify the size of an array before you start to use it.
-What's more, in @code{awk} any number or even a string may be used as an
+What's more, in @code{awk} any number or string may be used as an
array index.
In most other languages, you have to @dfn{declare} an array and specify
-how many elements or components it has. In such languages, the
+how many elements or components it contains. In such languages, the
declaration causes a contiguous block of memory to be allocated for that
many elements. An index in the array must be a nonnegative integer; for
example, the index 0 specifies the first element in the array, which is
@@ -5280,11 +5981,10 @@ array element value:
@end example
@noindent
-We have shown the pairs in jumbled order because their order doesn't
-mean anything.
+We have shown the pairs in jumbled order because their order is irrelevant.
One advantage of an associative array is that new pairs can be added
-at any time. For example, suppose we add to that array a tenth element
+at any time. For example, suppose we add to the above array a tenth element
whose value is @w{@code{"number ten"}}. The result is this:
@example
@@ -5297,7 +5997,7 @@ whose value is @w{@code{"number ten"}}. The result is this:
@noindent
Now the array is @dfn{sparse} (i.e., some indices are missing): it has
-elements 4 and 10, but doesn't have elements 5, 6, 7, 8, or 9.@refill
+elements 1--4 and 10, but doesn't have elements 5, 6, 7, 8, or 9.@refill
Another consequence of associative arrays is that the indices don't
have to be positive integers. Any number, or even a string, can be
@@ -5421,13 +6121,6 @@ END @{
@}
@end example
-@ignore
-The first rule just initializes the variable @code{max}. (This is not
-strictly necessary, since an uninitialized variable has the null string
-as its value, and the null string is effectively zero when used in
-a context where a number is required.)
-@end ignore
-
The first rule keeps track of the largest line number seen so far;
it also stores each line into the array @code{arr}, at an index that
is the line's number.
@@ -5513,7 +6206,6 @@ function @code{length}.
# Find number of distinct words more than 10 characters long.
END @{
- num_long_words = 0
for (x in used)
if (length(x) > 10) @{
++num_long_words
@@ -5534,7 +6226,7 @@ problems if new elements are added to @var{array} by statements in
reach them. Similarly, changing @var{var} inside the loop can produce
strange results. It is best to avoid such things.@refill
-@node Delete, Multi-dimensional, Scanning an Array, Arrays
+@node Delete, Numeric Array Subscripts, Scanning an Array, Arrays
@section The @code{delete} Statement
@cindex @code{delete} statement
@cindex deleting elements of arrays
@@ -5572,7 +6264,61 @@ if (4 in foo)
print "This will never be printed"
@end example
-@node Multi-dimensional, Multi-scanning, Delete, Arrays
+It is not an error to delete an element which does not exist.
+
+@node Numeric Array Subscripts, Multi-dimensional, Delete, Arrays
+@section Using Numbers to Subscript Arrays
+
+An important aspect of arrays to remember is that array subscripts
+are @emph{always} strings. If you use a numeric value as a subscript,
+it will be converted to a string value before it is used for subscripting
+(@pxref{Conversion}).
+
+@cindex conversions, during subscripting
+@cindex numbers, used as subscripts
+@vindex CONVFMT
+This means that the value of @code{CONVFMT} can potentially
+affect how your program accesses elements of an array. For example:
+
+@example
+a = b = 12.153
+data[a] = 1
+CONVFMT = "%2.2f"
+if (b in data)
+ printf "%s is in data", b
+else
+ printf "%s is not in data", b
+@end example
+
+@noindent
+should print @samp{12.15 is not in data}. The first statement gives
+both @code{a} and @code{b} the same numeric value. Assigning to
+@code{data[a]} first gives @code{a} the string value @code{"12.153"}
+(using the default conversion value of @code{CONVFMT}, @code{"%.6g"}),
+and then assigns 1 to @code{data["12.153"]}. The program then changes
+the value of @code{CONVFMT}. The test @samp{(b in data)} forces @code{b}
+to be converted to a string, this time @code{"12.15"}, since the value of
+@code{CONVFMT} only allows two digits after the decimal point. This test fails,
+since @code{"12.15"} is a different string from @code{"12.153"}.@refill
+
+Following the rules for conversions (@pxref{Conversion}), integer
+values are always converted to strings as integers, no matter what the
+value of @code{CONVFMT} may happen to be. So the usual case of
+
+@example
+for (i = 1; i <= maxsub; i++)
+ @i{do something with} array[i]
+@end example
+
+@noindent
+will work, no matter what the value of @code{CONVFMT}.
+
+Like many things in @code{awk}, the majority of the time things work
+as you would expect them to work. But it is useful to have a precise
+knowledge of the actual rules, since sometimes they can have a subtle
+effect on your programs.
+
+@node Multi-dimensional, Multi-scanning, Numeric Array Subscripts, Arrays
@section Multi-dimensional Arrays
@cindex subscripts, multi-dimensional in arrays
@@ -5597,15 +6343,16 @@ variable @code{SUBSEP}.
For example, suppose we evaluate the expression @code{foo[5,12]="value"}
when the value of @code{SUBSEP} is @code{"@@"}. The numbers 5 and 12 are
+converted to strings and
concatenated with an @samp{@@} between them, yielding @code{"5@@12"}; thus,
-the array element @code{foo["5@@12"]} is set to @code{"value"}.
+the array element @code{foo["5@@12"]} is set to @code{"value"}.@refill
Once the element's value is stored, @code{awk} has no record of whether
it was stored with a single index or a sequence of indices. The two
expressions @code{foo[5,12]} and @w{@code{foo[5 SUBSEP 12]}} always have
the same value.
-The default value of @code{SUBSEP} is actually the string @code{"\034"},
+The default value of @code{SUBSEP} is the string @code{"\034"},
which contains a nonprinting character that is unlikely to appear in an
@code{awk} program or in the input data.
@@ -5615,7 +6362,7 @@ combined strings that are ambiguous. Suppose that @code{SUBSEP} were
@code{"@@"}; then @w{@code{foo["a@@b", "c"]}} and @w{@code{foo["a",
"b@@c"]}} would be indistinguishable because both would actually be
stored as @code{foo["a@@b@@c"]}. Because @code{SUBSEP} is
-@code{"\034"}, such confusion can actually happen only when an index
+@code{"\034"}, such confusion can arise only when an index
contains the character with ASCII code 034, which is a rare
event.@refill
@@ -5737,7 +6484,9 @@ new functions yourself. @xref{User-defined}.)
* String Functions:: Functions for string manipulation,
such as @code{split}, @code{match}, and @code{sprintf}.
-* I/O Functions:: Functions for files and shell commands
+* I/O Functions:: Functions for files and shell commands.
+
+* Time Functions:: Functions for dealing with time stamps.
@end menu
@node Calling Built-in, Numeric Functions, Built-in, Built-in
@@ -5804,8 +6553,7 @@ This gives you the sine of @var{x}, with @var{x} in radians.
This gives you the cosine of @var{x}, with @var{x} in radians.
@item atan2(@var{y}, @var{x})
-This gives you the arctangent of @code{@var{y} / @var{x}}, with the
-quotient understood in radians.
+This gives you the arctangent of @code{@var{y} / @var{x}} in radians.
@item rand()
This gives you a random number. The values of @code{rand} are
@@ -5827,7 +6575,8 @@ than @var{n}. We then make it an integer (using @code{int}) between 0
and @code{@var{n} @minus{} 1}.
Here is an example where a similar function is used to produce
-random integers between 1 and @var{n}:
+random integers between 1 and @var{n}. Note that this program will
+print a new random number for each input record.
@example
awk '
@@ -5864,29 +6613,20 @@ numbers that are truly unpredictable.
The return value of @code{srand} is the previous seed. This makes it
easy to keep track of the seeds for use in consistently reproducing
sequences of random numbers.
-
-@item time()
-The function @code{time} (not in all versions of @code{awk}) returns the
-current time in seconds since January 1, 1970.
-
-@item ctime(@var{then})
-The function @code{ctime} (not in all versions of @code{awk}) takes an numeric
-argument in seconds and returns a string representing the corresponding date,
-suitable for printing or further processing.
@end table
@node String Functions, I/O Functions, Numeric Functions, Built-in
@section Built-in Functions for String Manipulation
- The functions in this section look at the text of one or more
+The functions in this section look at or change the text of one or more
strings.
@table @code
@item index(@var{in}, @var{find})
@findex match
This searches the string @var{in} for the first occurrence of the string
-@var{find}, and returns the position where that occurrence begins in the
-string @var{in}. For example:@refill
+@var{find}, and returns the position in characters where that occurrence
+begins in the string @var{in}. For example:@refill
@example
awk 'BEGIN @{ print index("peanut", "an") @}'
@@ -5894,6 +6634,7 @@ awk 'BEGIN @{ print index("peanut", "an") @}'
@noindent
prints @samp{3}. If @var{find} is not found, @code{index} returns 0.
+(Remember that string indices in @code{awk} start at 1.)
@item length(@var{string})
@findex length
@@ -5906,6 +6647,11 @@ three characters.
If no argument is supplied, @code{length} returns the length of @code{$0}.
+In older versions of @code{awk}, you could call the @code{length} function
+without any parentheses. However, this is not allowed by the @sc{POSIX}
+specification, and for maximal portability of your @code{awk} programs
+you should always supply the parentheses.
+
@item match(@var{string}, @var{regexp})
@findex match
The @code{match} function searches the string, @var{string}, for the
@@ -5918,8 +6664,8 @@ where that substring begins (1, if it starts at the beginning of
@vindex RLENGTH
The @code{match} function sets the built-in variable @code{RSTART} to
the index. It also sets the built-in variable @code{RLENGTH} to the
-length of the matched substring. If no match is found, @code{RSTART}
-is set to 0, and @code{RLENGTH} to @minus{}1.
+length in characters of the matched substring. If no match is found,
+@code{RSTART} is set to 0, and @code{RLENGTH} to @minus{}1.
For example:
@@ -5964,8 +6710,9 @@ Match of Melvin found at 26 in This file created by Melvin.
This divides @var{string} up into pieces separated by @var{fieldsep},
and stores the pieces in @var{array}. The first piece is stored in
@code{@var{array}[1]}, the second piece in @code{@var{array}[2]}, and so
-forth. The string value of the third argument, @var{fieldsep}, is used
-as a regexp to search for to find the places to split @var{string}. If
+forth. The string value of the third argument, @var{fieldsep}, is
+a regexp describing where to split @var{string} (much as @code{FS} can
+be a regexp describing where to split input records). If
the @var{fieldsep} is omitted, the value of @code{FS} is used.
@code{split} returns the number of elements created.@refill
@@ -5989,6 +6736,10 @@ a[3] = "fe"
@noindent
The value returned by this call to @code{split} is 3.
+As with input field-splitting, when the value of @var{fieldsep} is
+@code{" "}, leading and trailing whitespace is ignored, and the elements
+are separated by runs of whitespace.
+
@item sprintf(@var{format}, @var{expression1},@dots{})
@findex sprintf
This returns (without printing) the string that @code{printf} would
@@ -6044,7 +6795,21 @@ awk '@{ sub(/candidate/, "& and his wife"); print @}'
changes the first occurrence of @samp{candidate} to @samp{candidate
and his wife} on each input line.
-The effect of this special character can be turned off by putting a
+Here is another example:
+
+@example
+awk 'BEGIN @{
+ str = "daabaaa"
+ sub(/a*/, "c&c", str)
+ print str
+@}'
+@end example
+
+@noindent
+prints @samp{dcaacbaaa}. This shows how @samp{&} can represent a non-constant
+string, and also illustrates the ``leftmost, longest'' rule.
+
+The effect of this special character (@samp{&}) can be turned off by putting a
backslash before it in the string. As usual, to insert one backslash in
the string, you must write two backslashes. Therefore, write @samp{\\&}
in a string constant to include a literal @samp{&} in the replacement.
@@ -6075,7 +6840,7 @@ But that is considered erroneous in @code{gawk}.
This is similar to the @code{sub} function, except @code{gsub} replaces
@emph{all} of the longest, leftmost, @emph{nonoverlapping} matching
substrings it can find. The @samp{g} in @code{gsub} stands for
-``global'', which means replace everywhere. For example:@refill
+``global,'' which means replace everywhere. For example:@refill
@example
awk '@{ gsub(/Britain/, "United Kingdom"); print @}'
@@ -6101,7 +6866,9 @@ string is character number one. For example,
If @var{length} is not present, this function returns the whole suffix of
@var{string} that begins at character number @var{start}. For example,
-@code{substr("washington", 5)} returns @code{"ington"}.
+@code{substr("washington", 5)} returns @code{"ington"}. This is also
+the case if @var{length} is greater than the number of characters remaining
+in the string, counting from character number @var{start}.
@item tolower(@var{string})
@findex tolower
@@ -6118,7 +6885,7 @@ Nonalphabetic characters are left unchanged. For example,
@code{toupper("MiXeD cAsE 123")} returns @code{"MIXED CASE 123"}.
@end table
-@node I/O Functions, , String Functions, Built-in
+@node I/O Functions, Time Functions, String Functions, Built-in
@section Built-in Functions For Input/Output
@table @code
@@ -6160,6 +6927,253 @@ Some operating systems cannot implement the @code{system} function.
@code{system} causes a fatal error if it is not supported.
@end table
+@node Time Functions, , I/O Functions, Built-in
+@section Functions For Dealing With Time Stamps
+
+@cindex time stamps
+@cindex time of day
+A common use for @code{awk} programs is the processing of log files.
+Log files often contain time stamp information, indicating when a
+particular log record was written. Many programs log their time stamp
+in the form returned by the @code{time} system call, which is the
+number of seconds since a particular epoch. On @sc{POSIX} systems,
+it is the number of seconds since Midnight, January 1, 1970, UTC.
+
+In order to make it easier to process such log files, and to easily produce
+useful reports, @code{gawk} provides two functions for working with time
+stamps. Both of these are @code{gawk} extensions; they are not specified
+in the @sc{POSIX} standard, nor are they in any other known version
+of @code{awk}.
+
+@table @code
+@item systime()
+@findex systime
+This function returns the current time as the number of seconds since
+the system epoch. On @sc{POSIX} systems, this is the number of seconds
+since Midnight, January 1, 1970, UTC. It may be a different number on
+other systems.
+
+@item strftime(@var{format}, @var{timestamp})
+@findex strftime
+This function returns a string. It is similar to the function of the
+same name in the @sc{ANSI} C standard library. The time specified by
+@var{timestamp} is used to produce a string, based on the contents
+of the @var{format} string.
+@end table
+
+The @code{systime} function allows you to compare a time stamp from a
+log file with the current time of day. In particular, it is easy to
+determine how long ago a particular record was logged. It also allows
+you to produce log records using the ``seconds since the epoch'' format.
+
+The @code{strftime} function allows you to easily turn a time stamp
+into human-readable information. It is similar in nature to the @code{sprintf}
+function, copying non-format specification characters verbatim to the
+returned string, and substituting date and time values for format
+specifications in the @var{format} string. If no @var{timestamp} argument
+is supplied, @code{gawk} will use the current time of day as the
+time stamp.@refill
+
+@code{strftime} is guaranteed by the @sc{ANSI} C standard to support
+the following date format specifications:
+
+@table @code
+@item %a
+The locale's abbreviated weekday name.
+
+@item %A
+The locale's full weekday name.
+
+@item %b
+The locale's abbreviated month name.
+
+@item %B
+The locale's full month name.
+
+@item %c
+The locale's ``appropriate'' date and time representation.
+
+@item %d
+The day of the month as a decimal number (01--31).
+
+@item %H
+The hour (24-hour clock) as a decimal number (00--23).
+
+@item %I
+The hour (12-hour clock) as a decimal number (01--12).
+
+@item %j
+The day of the year as a decimal number (001--366).
+
+@item %m
+The month as a decimal number (01--12).
+
+@item %M
+The minute as a decimal number (00--59).
+
+@item %p
+The locale's equivalent of the AM/PM designations associated
+with a 12-hour clock.
+
+@item %S
+The second as a decimal number (00--61). (Occasionally there are
+minutes in a year with one or two leap seconds, which is why the
+seconds can go from 0 all the way to 61.)
+
+@item %U
+The week number of the year (the first Sunday as the first day of week 1)
+as a decimal number (00--53).
+
+@item %w
+The weekday as a decimal number (0--6). Sunday is day 0.
+
+@item %W
+The week number of the year (the first Monday as the first day of week 1)
+as a decimal number (00--53).
+
+@item %x
+The locale's ``appropriate'' date representation.
+
+@item %X
+The locale's ``appropriate'' time representation.
+
+@item %y
+The year without century as a decimal number (00--99).
+
+@item %Y
+The year with century as a decimal number.
+
+@item %Z
+The time zone name or abbreviation, or no characters if
+no time zone is determinable.
+
+@item %%
+A literal @samp{%}.
+@end table
+
+If a conversion specifier is not one of the above, the behavior is undefined.
+@footnote{This is because the @sc{ANSI} standard for C leaves the behavior
+of the C version of @code{strftime} undefined, and @code{gawk} will use the
+system's version of @code{strftime} if it's there. Typically, the conversion
+specifier will either not appear in the returned string, or it will appear
+literally.}
+
+Informally, a @dfn{locale} is the geographic place in which a program
+is meant to run. For example, a common way to abbreviate the date
+September Fourth, 1991 in the United States would be ``9/4/91''.
+In many countries in Europe, however, it would be abbreviated ``4.9.91''.
+Thus, the @samp{%x} specification in a @code{"US"} locale might produce
+@samp{9/4/91}, while in a @code{"EUROPE"} locale, it might produce
+@samp{4.9.91}. The @sc{ANSI} C standard defines a default @code{"C"}
+locale, which is an environment that is typical of what most C programmers
+are used to.
+
+A public-domain C version of @code{strftime} is shipped with @code{gawk}
+for systems that are not yet fully @sc{ANSI}-compliant. If that version is
+used to compile @code{gawk} (@pxref{Installation}), then the following
+additional format specifications are available:
+
+@table @code
+@item %D
+Equivalent to specifying @samp{%m/%d/%y}.
+
+@item %e
+The day of the month, padded with a blank if it is only one digit.
+
+@item %h
+Equivalent to @samp{%b}, above.
+
+@item %n
+A newline character (ASCII LF).
+
+@item %r
+Equivalent to specifying @samp{%I:%M:%S %p}.
+
+@item %R
+Equivalent to specifying @samp{%H:%M}.
+
+@item %T
+Equivalent to specifying @samp{%H:%M:%S}.
+
+@item %t
+A TAB character.
+
+@item %C
+The century, as a number between 00 and 99.
+
+@item %Ec %EC %Ex %Ey %EY %Od %Oe %OH
+@itemx %OI %Om %OM %OS %OU %Ow %OW %Oy
+These are ``alternate representations'' for the specifications
+that use only the second letter (@samp{%c}, @samp{%C}, and so on).
+They are recognized, but their normal representations are used.
+(These facilitate compliance with the @sc{POSIX} @code{date}
+utility.)@refill
+
+@item %V
+The date in VMS format (e.g. 20-JUN-1991).
+@end table
+
+Here are two examples that use @code{strftime}. The first is an
+@code{awk} version of the C @code{ctime} function.
+
+@example
+# ctime.awk
+#
+# awk version of C ctime(3) function
+
+function ctime( format)
+@{
+ format = "%a %b %e %H:%M:%S %Z %Y"
+
+ return strftime(format) # defaults to current time
+@}
+@end example
+
+This next example is an @code{awk} implementation of the @sc{POSIX}
+@code{date} utility. Normally, the @code{date} utility prints the
+current date and time of day in a well-known format. However, if you
+provide an argument to it that begins with a @samp{+}, @code{date}
+will copy non-format specifier characters to the standard output, and
+will interpret the current time according to the format specifiers in
+the string. For example:
+
+@example
+date '+Today is %A, %B %d, %Y.'
+@end example
+
+@noindent
+might print
+
+@example
+Today is Thursday, July 11, 1991.
+@end example
+
+Here is the @code{awk} version of the @code{date} utility.
+
+@example
+#! /usr/bin/gawk -f
+#
+# date --- implement the P1003.2 Draft 11 'date' command
+#
+# Bug: does not recognize the -u argument.
+
+BEGIN \
+@{
+ format = "%a %b %e %H:%M:%S %Z %Y"
+ exitval = 0
+
+ if (ARGC > 2)
+ exitval = 1
+ else if (ARGC == 2) @{
+ format = ARGV[1]
+ if (format ~ /^\+/)
+ format = substr(format, 2) # remove leading +
+ @}
+ print strftime(format)
+ exit exitval
+@}
+@end example
+
@node User-defined, Built-in Variables, Built-in, Top
@chapter User-defined Functions
@@ -6196,11 +7210,10 @@ function @var{name} (@var{parameter-list}) @{
@end example
@noindent
-The keyword @code{function} may be abbreviated @code{func}.
-
@var{name} is the name of the function to be defined. A valid function
name is like a valid variable name: a sequence of letters, digits and
-underscores, not starting with a digit.
+underscores, not starting with a digit. Functions share the same pool
+of names as variables and arrays.
@var{parameter-list} is a list of the function's arguments and local
variable names, separated by commas. When the function is called,
@@ -6250,6 +7263,24 @@ There is no need in @code{awk} to put the definition of a function
before all uses of the function. This is because @code{awk} reads the
entire program before starting to execute any of it.
+In many @code{awk} implementations, the keyword @code{function} may be
+abbreviated @code{func}. However, @sc{POSIX} only specifies the use of
+the keyword @code{function}. This actually has some practical implications.
+If @code{gawk} is in @sc{POSIX}-compatibility mode (@pxref{Command Line}),
+then the following statement will @emph{not} define a function:@refill
+
+@example
+func foo() @{ a = sqrt($1) ; print a @}
+@end example
+
+@noindent
+Instead it defines a rule that, for each record, concatenates the value
+of the variable @samp{func} with the return value of the function @samp{foo},
+and based on the truth value of the result, executes the corresponding action.
+This is probably not what was desired. (@code{awk} accepts this input as
+syntactically valid, since functions may be used before they are defined
+in @code{awk} programs.)
+
@node Function Example, Function Caveats, Definition Syntax, User-defined
@section Function Definition Example
@@ -6276,9 +7307,9 @@ This program prints, in our special format, all the third fields that
contain a positive number in our input. Therefore, when given:
@example
- 1.2 3.4 5.6 7.8
- 9.10 11.12 13.14 15.16
-17.18 19.20 21.22 23.24
+ 1.2 3.4 5.6 7.8
+ 9.10 11.12 -13.14 15.16
+17.18 19.20 21.22 23.24
@end example
@noindent
@@ -6286,7 +7317,6 @@ this program, using our function to format the results, prints:
@example
5.6
- 13.1
21.2
@end example
@@ -6342,7 +7372,7 @@ z = myfunc(foo)
@noindent
then you should not think of the argument to @code{myfunc} as being
-``the variable @code{foo}''. Instead, think of the argument as the
+``the variable @code{foo}.'' Instead, think of the argument as the
string value, @code{"bar"}.
If the function @code{myfunc} alters the values of its local variables,
@@ -6407,7 +7437,9 @@ value is undefined and, therefore, unpredictable.
A @code{return} statement with no value expression is assumed at the end of
every function definition. So if control reaches the end of the function
-definition, then the function returns an unpredictable value.
+body, then the function returns an unpredictable value. @code{awk}
+will not warn you if you use the return value of such a function; you will
+simply get unpredictable or unexpected results.
Here is an example of a user-defined function that returns a value
for the largest number among the elements of an array:@refill
@@ -6481,8 +7513,8 @@ is the largest number in our array.
@cindex built-in variables
Most @code{awk} variables are available for you to use for your own
-purposes; they never change except when your program assigns them, and
-never affect anything except when your program examines them.
+purposes; they never change except when your program assigns values to
+them, and never affect anything except when your program examines them.
A few variables have special built-in meanings. Some of them @code{awk}
examines automatically, so that they enable you to tell @code{awk} how
@@ -6508,9 +7540,33 @@ This is a list of the variables which you can change to control how
@code{awk} does certain things.
@table @code
-@c it's unadvisable to have multiple index entries for the same name
-@c since in Info there is no way to distinguish the two.
-@c @vindex FS
+@iftex
+@vindex CONVFMT
+@end iftex
+@item CONVFMT
+This string is used by @code{awk} to control conversion of numbers to
+strings (@pxref{Conversion}). It works by being passed, in effect, as
+the first argument to the @code{sprintf} function. Its default value
+is @code{"%.6g"}. @code{CONVFMT} was introduced by the @sc{POSIX}
+standard.@refill
+
+@iftex
+@vindex FIELDWIDTHS
+@end iftex
+@item FIELDWIDTHS
+This is a space-separated list of columns that tells @code{gawk}
+how to manage input with fixed, columnar boundaries. It is an
+experimental feature that is still evolving. Assigning to @code{FIELDWIDTHS}
+overrides the use of @code{FS} for field splitting.
+@xref{Constant Size}, for more information.@refill
+
+If @code{gawk} is in compatibility mode (@pxref{Command Line}), then
+@code{FIELDWIDTHS} has no special meaning, and field splitting operations are
+done based exclusively on the value of @code{FS}.
+
+@iftex
+@vindex FS
+@end iftex
@item FS
@code{FS} is the input field separator (@pxref{Field Separators}).
The value is a single-character string or a multi-character regular
@@ -6529,8 +7585,14 @@ You can set the value of @code{FS} on the command line using the
awk -F, '@var{program}' @var{input-files}
@end example
+If @code{gawk} is using @code{FIELDWIDTHS} for field-splitting,
+assigning a value to @code{FS} will cause @code{gawk} to return to
+the normal, regexp-based, field splitting.
+
@item IGNORECASE
-@c @vindex IGNORECASE
+@iftex
+@vindex IGNORECASE
+@end iftex
If @code{IGNORECASE} is nonzero, then @emph{all} regular expression
matching is done in a case-independent fashion. In particular, regexp
matching with @samp{~} and @samp{!~}, and the @code{gsub}, @code{index},
@@ -6545,54 +7607,69 @@ If @code{gawk} is in compatibility mode (@pxref{Command Line}), then
always case-sensitive.@refill
@item OFMT
-@c @vindex OFMT
+@iftex
+@vindex OFMT
+@end iftex
This string is used by @code{awk} to control conversion of numbers to
-strings (@pxref{Conversion}). It works by being passed, in effect, as
-the first argument to the @code{sprintf} function. Its default value
-is @code{"%.6g"}.@refill
+strings (@pxref{Conversion}) for printing with the @code{print} statement.
+It works by being passed, in effect, as the first argument to the
+@code{sprintf} function. Its default value is @code{"%.6g"}.
+Earlier versions of @code{awk} also used @code{OFMT} to specify the
+format for converting numbers to strings in general expressions; this
+has been taken over by @code{CONVFMT}.@refill
@item OFS
-@c @vindex OFS
+@iftex
+@vindex OFS
+@end iftex
This is the output field separator (@pxref{Output Separators}). It is
output between the fields output by a @code{print} statement. Its
default value is @w{@code{" "}}, a string consisting of a single space.
@item ORS
-@c @vindex ORS
+@iftex
+@vindex ORS
+@end iftex
This is the output record separator. It is output at the end of every
@code{print} statement. Its default value is a string containing a
single newline character, which could be written as @code{"\n"}.
(@xref{Output Separators}).@refill
@item RS
-@c @vindex RS
-This is @code{awk}'s record separator. Its default value is a string
+@iftex
+@vindex RS
+@end iftex
+This is @code{awk}'s input record separator. Its default value is a string
containing a single newline character, which means that an input record
consists of a single line of text. (@xref{Records}.)@refill
@item SUBSEP
-@c @vindex SUBSEP
-@code{SUBSEP} is a subscript separator. It has the default value of
+@iftex
+@vindex SUBSEP
+@end iftex
+@code{SUBSEP} is the subscript separator. It has the default value of
@code{"\034"}, and is used to separate the parts of the name of a
multi-dimensional array. Thus, if you access @code{foo[12,3]}, it
-really accesses @code{foo["12\0343"]}. (@xref{Multi-dimensional}).@refill
+really accesses @code{foo["12\0343"]} (@pxref{Multi-dimensional}).@refill
@end table
@node Auto-set, , User-modified, Built-in Variables
@section Built-in Variables That Convey Information to You
This is a list of the variables that are set automatically by @code{awk}
-on certain occasions so as to provide information for your program.
+on certain occasions so as to provide information to your program.
@table @code
@item ARGC
@itemx ARGV
-@c @vindex ARGC
-@c @vindex ARGV
-The command-line arguments available to @code{awk} are stored in an
-array called @code{ARGV}. @code{ARGC} is the number of command-line
-arguments present. @code{ARGV} is indexed from zero to @w{@code{ARGC - 1}}.
-@xref{Command Line}. For example:
+@iftex
+@vindex ARGC
+@vindex ARGV
+@end iftex
+The command-line arguments available to @code{awk} programs are stored in
+an array called @code{ARGV}. @code{ARGC} is the number of command-line
+arguments present. @xref{Command Line}. @code{ARGV} is indexed from zero
+to @w{@code{ARGC - 1}}. For example:
@example
awk '@{ print ARGV[$1] @}' inventory-shipped BBS-list
@@ -6605,6 +7682,10 @@ contains @code{"inventory-shipped"}, and @code{ARGV[2]} contains
index of the last element in @code{ARGV} since the elements are numbered
from zero.@refill
+The names @code{ARGC} and @code{ARGV}, as well as the convention of indexing
+the array from 0 to @w{@code{ARGC - 1}}, are derived from the C language's
+method of accessing command line arguments.@refill
+
Notice that the @code{awk} program is not entered in @code{ARGV}. The
other special command line options, with their arguments, are also not
entered. But variable assignments on the command line @emph{are}
@@ -6628,6 +7709,10 @@ To eliminate a file from the middle of the list, store the null string
special feature, @code{awk} ignores file names that have been
replaced with the null string.
+@ignore
+see getopt.awk in the examples...
+@end ignore
+
@item ENVIRON
@vindex ENVIRON
This is an array that contains the values of the environment. The array
@@ -6642,42 +7727,54 @@ Some operating systems may not have environment variables.
On such systems, the array @code{ENVIRON} is empty.
@item FILENAME
-@c @vindex FILENAME
+@iftex
+@vindex FILENAME
+@end iftex
This is the name of the file that @code{awk} is currently reading.
If @code{awk} is reading from the standard input (in other words,
there are no files listed on the command line),
@code{FILENAME} is set to @code{"-"}.
-@code{FILENAME} is changed each time a new file is read (@pxref{Reading
-Files}).@refill
+@code{FILENAME} is changed each time a new file is read
+(@pxref{Reading Files}).@refill
@item FNR
-@c @vindex FNR
+@iftex
+@vindex FNR
+@end iftex
@code{FNR} is the current record number in the current file. @code{FNR} is
incremented each time a new record is read (@pxref{Getline}).
It is reinitialized to 0 each time a new input file is started.
@item NF
-@c @vindex NF
+@iftex
+@vindex NF
+@end iftex
@code{NF} is the number of fields in the current input record.
@code{NF} is set each time a new record is read, when a new field is
created, or when @code{$0} changes (@pxref{Fields}).@refill
@item NR
-@c @vindex NR
+@iftex
+@vindex NR
+@end iftex
This is the number of input records @code{awk} has processed since
the beginning of the program's execution. (@pxref{Records}).
@code{NR} is set each time a new record is read.@refill
@item RLENGTH
-@c @vindex RLENGTH
+@iftex
+@vindex RLENGTH
+@end iftex
@code{RLENGTH} is the length of the substring matched by the
@code{match} function (@pxref{String Functions}). @code{RLENGTH} is set
by invoking the @code{match} function. Its value is the length of the
matched string, or @minus{}1 if no match was found.@refill
@item RSTART
-@c @vindex RSTART
-@code{RSTART} is the start-index of the substring matched by the
+@iftex
+@vindex RSTART
+@end iftex
+@code{RSTART} is the start-index in characters of the substring matched by the
@code{match} function (@pxref{String Functions}). @code{RSTART} is set
by invoking the @code{match} function. Its value is the position of the
string where the matched substring starts, or 0 if no match was
@@ -6697,8 +7794,8 @@ one or more program files. Here are templates for both of them; items
enclosed in @samp{@r{[}@dots{}@r{]}} in these templates are optional.
@example
-awk @r{[@code{-F@var{fs}}] [@code{-v @var{var}=@var{val}}] [@code{-V}] [@code{-C}] [@code{-c}] [@code{-a}] [@code{-e}] [@code{--}]} '@var{program}' @var{file} @dots{}
-awk @r{[@code{-F@var{fs}}] @code{-f @var{source-file}} [@code{-f @var{source-file} @dots{}}] [@code{-v @var{var}=@var{val}}] [@code{-V}] [@code{-C}] [@code{-c}] [@code{-a}] [@code{-e}] [@code{--}]} @var{file} @dots{}
+awk @r{[@code{-F@var{fs}}] [@code{-W} @var{gawk-opts}] [@code{-v @var{var}=@var{val}}] [@code{--}]} '@var{program}' @var{file} @dots{}
+awk @r{[@code{-F@var{fs}}] [@code{-W} @var{gawk-opts}] [@code{-v @var{var}=@var{val}}] @code{-f @var{source-file}} [@code{-f @var{source-file} @dots{}}] [@code{--}]} @var{file} @dots{}
@end example
@menu
@@ -6727,62 +7824,88 @@ Sets the variable @var{var} to the value @var{val} @emph{before}
execution of the program begins. Such variable values are available
inside the @code{BEGIN} rule (see below for a fuller explanation).
-The @samp{-v} option only has room to set one variable, but you can use
+The @samp{-v} option can only set one variable, but you can use
it more than once, setting another variable each time, like this:
@samp{@w{-v foo=1} @w{-v bar=2}}.
-@item -a
-Specifies use of traditional @code{awk} syntax for regular expressions.
-This means that @samp{\} can be used to quote any regular expression
-operators inside of square brackets, just as it can be outside of them.
-This mode is currently the default; the @samp{-a} option is useful in
-shell scripts so that they will not break if the default is changed.
-@xref{Regexp Operators}.
-
-@item -e
-Specifies use of @code{egrep} syntax for regular expressions. This
-means that @samp{\} does not serve as a quoting character inside of
-square brackets; ideosyncratic techniques are needed to include various
-special characters within them. This mode may become the default at
-some time in the future. @xref{Regexp Operators}.
+@item -W @var{gawk-opt}
+Following the @sc{POSIX} standard, options that are specific to @code{gawk}
+are supplied as arguments to the @samp{-W} option. These arguments
+may be separated by commas, or quoted and separated by whitespace.
+Case is ignored when processing these options. The following options
+are available:
-@item -c
-@cindex @samp{-c} option
+@table @code
+@item compat
Specifies @dfn{compatibility mode}, in which the GNU extensions in
@code{gawk} are disabled, so that @code{gawk} behaves just like Unix
-@code{awk}. These extensions are noted below, where their usage is
-explained. @xref{Compatibility Mode}.
+@code{awk}. @xref{POSIX/GNU}, which summarizes the extensions.
+Also see @ref{Compatibility Mode}.
-@item -V
-@cindex @samp{-V} option
+@item lint
+Provide warnings about constructs that are dubious or non-portable to
+other @code{awk} implementations.
+
+@item copyleft
+@itemx copyright
+Print the short version of the General Public License.
+This option may disappear in a future version of @code{gawk}.
+
+@item posix
+Operate in strict @sc{POSIX} mode. This disables all @code{gawk}
+extensions (just like @code{compat}), and adds the following additional
+restrictions:
+
+@itemize @bullet{}
+@item
+@code{\x} escape sequences are not recognized (@pxref{Constants}).
+
+@item
+The synonym @code{func} for the keyword @code{function} is not
+recognized (@pxref{Definition Syntax}).
+
+@item
+The operators @samp{**} and @samp{**=} cannot be used in
+place of @samp{^} and @samp{^=} (@pxref{Arithmetic Ops}, and also
+@pxref{Assignment Ops}).@refill
+
+@item
+Specifying @samp{-Ft} on the command line does not set the value
+of @code{FS} to be a single tab character (@pxref{Field Separators}).
+@end itemize
+
+Although you can supply both @samp{-W compat} and @samp{-W posix} on the
+command line, @samp{-W posix} will take precedence.
+
+@item version
Prints version information for this particular copy of @code{gawk}.
This is so you can determine if your copy of @code{gawk} is up to date
with respect to whatever the Free Software Foundation is currently
distributing. This option may disappear in a future version of @code{gawk}.
-
-@item -C
-@cindex @samp{-C} option
-Prints the short version of the General Public License.
-This option may disappear in a future version of @code{gawk}.
+@end table
@item --
Signals the end of the command line options. The following arguments
are not treated as options even if they begin with @samp{-}. This
-interpretation of @samp{--} follows the POSIX argument parsing
+interpretation of @samp{--} follows the @sc{POSIX} argument parsing
conventions.
This is useful if you have file names that start with @samp{-},
or in shell scripts, if you have file names that will be specified
-by the user and that might start with @samp{-}.
+by the user which could start with @samp{-}.
@end table
+The @samp{-a}, @samp{-e}, @samp{-c}, @samp{-C}, and @samp{-V} options
+of @code{gawk} version 2.11.1 are recognized, but produce a warning
+message. They will go away in the next major release of @code{gawk}.
+
Any other options are flagged as invalid with a warning message, but
are otherwise ignored.
In compatibility mode, as a special case, if the value of @var{fs} supplied
to the @samp{-F} option is @samp{t}, then @code{FS} is set to the tab
-character (@code{"\t"}). Also, the @samp{-C} and @samp{-V} options
-are not recognized.@refill
+character (@code{"\t"}). This is only true for @samp{-W compat}, and not
+for @samp{-W posix} (@pxref{Field Separators}).@refill
If the @samp{-f} option is @emph{not} used, then the first non-option
command line argument is expected to be the program text.
@@ -6797,6 +7920,9 @@ type in a program at the terminal and use library functions, by specifying
@samp{-f /dev/tty}. @code{awk} will read a file from the terminal
to use as part of the @code{awk} program. After typing your program,
type @kbd{Control-d} (the end-of-file character) to terminate it.
+(You may also use @samp{-f -} to read program source from the standard
+input, but then you won't be able to also use the standard input as a
+source of data.)
@node Other Arguments, AWKPATH Variable, Options, Command Line
@section Other Command Line Arguments
@@ -6824,14 +7950,15 @@ Therefore, the variables actually receive the specified values after all
previously specified files have been read. In particular, the values of
variables assigned in this fashion are @emph{not} available inside a
@code{BEGIN} rule (@pxref{BEGIN/END}), since such rules are run before
-@code{awk} begins scanning the argument list.@refill
+@code{awk} begins scanning the argument list. The values given on the
+command line are processed for escape sequences (@pxref{Constants}).@refill
In some earlier implementations of @code{awk}, when a variable assignment
occurred before any file names, the assignment would happen @emph{before}
the @code{BEGIN} rule was executed. Some applications came to depend
-upon this ``feature''. When @code{awk} was changed to be more consistent,
+upon this ``feature.'' When @code{awk} was changed to be more consistent,
the @samp{-v} option was added to accommodate applications that depended
-upon this old behaviour.
+upon this old behavior.
The variable assignment feature is most useful for assigning to variables
such as @code{RS}, @code{OFS}, and @code{ORS}, which control input and
@@ -6846,13 +7973,18 @@ awk 'pass == 1 @{ @var{pass 1 stuff} @}
pass == 2 @{ @var{pass 2 stuff} @}' pass=1 datafile pass=2 datafile
@end example
+Given the variable assignment feature, the @samp{-F} option is not
+strictly necessary. It remains for historical compatibility.
+
@node AWKPATH Variable,, Other Arguments, Command Line
@section The @code{AWKPATH} Environment Variable
@cindex @code{AWKPATH} environment variable
@cindex search path
@cindex directory search
@cindex path, search
-@c @cindex differences between @code{gawk} and @code{awk}
+@iftex
+@cindex differences between @code{gawk} and @code{awk}
+@end iftex
The previous section described how @code{awk} program files can be named
on the command line with the @samp{-f} option. In some @code{awk}
@@ -6864,11 +7996,13 @@ does not contain a @samp{/}, then @code{gawk} searches a list of
directories (called the @dfn{search path}), one by one, looking for a
file with the specified name.
-The search path is actually a string containing directory names
+The search path is actually a string consisting of directory names
separated by colons. @code{gawk} gets its search path from the
@code{AWKPATH} environment variable. If that variable does not exist,
@code{gawk} uses the default path, which is
-@samp{.:/usr/lib/awk:/usr/local/lib/awk}.@refill
+@samp{.:/usr/lib/awk:/usr/local/lib/awk}. (Programs written by
+system administrators should use an @code{AWKPATH} variable that
+does not include the current directory, @samp{.}.)@refill
The search path feature is particularly useful for building up libraries
of useful @code{awk} functions. The library files can be placed in a
@@ -6877,6 +8011,7 @@ the command line with a short file name. Otherwise, the full file name
would have to be typed for each file.
Path searching is not done if @code{gawk} is in compatibility mode.
+This is true for both @samp{-W compat} and @samp{-W posix}.
@xref{Command Line}.
@strong{Note:} if you want files in the current directory to be found,
@@ -6889,21 +8024,27 @@ found in the current directory. This path search mechanism is identical
to the shell's.
@c someday, @cite{The Bourne Again Shell}....
-@node Language History, Gawk Summary, Command Line, Top
+@node Language History, Installation, Command Line, Top
@chapter The Evolution of the @code{awk} Language
This manual describes the GNU implementation of @code{awk}, which is patterned
-after the System V Release 4 version. Many @code{awk} users are only familiar
+after the @sc{POSIX} specification. Many @code{awk} users are only familiar
with the original @code{awk} implementation in Version 7 Unix, which is also
-the basis for the version in Berkeley Unix. This chapter briefly describes
-the evolution of the @code{awk} language.
+the basis for the version in Berkeley Unix (through 4.3--Reno). This chapter
+briefly describes the evolution of the @code{awk} language.
@menu
-* V7/S5R3.1:: The major changes between V7 and System V Release 3.1.
+* V7/S5R3.1:: The major changes between V7 and System V Release 3.1.
+
+* S5R4:: The minor changes between System V Releases 3.1 and 4.
+
+* POSIX:: New features from the @sc{POSIX} standard.
-* S5R4:: The minor changes between System V Releases 3.1 and 4.
+* POSIX/GNU:: The extensions in @code{gawk} not in @sc{POSIX} @code{awk}.
-* S5R4/GNU:: The extensions in @code{gawk} not in System V Release 4.
+* Obsolete:: Obsolete Options and/or Features.
+
+* Undocumented:: Undocumented Options and Features.
@end menu
@node V7/S5R3.1, S5R4, Language History, Language History
@@ -6938,8 +8079,7 @@ The built-in functions @code{gsub}, @code{sub}, and @code{match}
(@pxref{String Functions}).
@item
-The built-in functions @code{close} and @code{system} (@pxref{I/O
-Functions}).
+The built-in functions @code{close} and @code{system} (@pxref{I/O Functions}).
@item
The @code{ARGC}, @code{ARGV}, @code{FNR}, @code{RLENGTH}, @code{RSTART},
@@ -6958,9 +8098,8 @@ C-compatible operator precedence, which breaks some old @code{awk}
programs (@pxref{Precedence}).
@item
-Regexps as the value of @code{FS} (@pxref{Field Separators}), or as the
-third argument to the @code{split} function (@pxref{String
-Functions}).@refill
+Regexps as the value of @code{FS} (@pxref{Field Separators}), and as the
+third argument to the @code{split} function (@pxref{String Functions}).@refill
@item
Dynamic regexps as operands of the @samp{~} and @samp{!~} operators
@@ -6983,10 +8122,11 @@ Multiple @code{BEGIN} and @code{END} rules (@pxref{BEGIN/END}).
Simulation of multidimensional arrays (@pxref{Multi-dimensional}).
@end itemize
-@node S5R4, S5R4/GNU, V7/S5R3.1, Language History
-@section Minor Changes between S5R3.1 and S5R4
+@node S5R4, POSIX, V7/S5R3.1, Language History
+@section Changes between S5R3.1 and S5R4
-The System V Release 4 version of Unix @code{awk} added these features:
+The System V Release 4 version of Unix @code{awk} added these features
+(some of which originated in @code{gawk}):
@itemize @bullet
@item
@@ -7018,13 +8158,47 @@ A cleaner specification for the @samp{%c} format-control letter in the
@code{printf} function (@pxref{Printf}).
@item
+The ability to dynamically pass the field width and precision (@code{"%*.*d"})
+in the argument list of the @code{printf} function (@pxref{Printf}).
+
+@item
The use of constant regexps such as @code{/foo/} as expressions, where
they are equivalent to use of the matching operator, as in @code{$0 ~
-/foo/}.
+/foo/} (@pxref{Constants}).
+@end itemize
+
+@node POSIX, POSIX/GNU, S5R4, Language History
+@section Changes Between S5R4 and the POSIX Standard for @code{awk}
+@c
+@c @section Changes Between S5R4 and the @sc{POSIX} Standard for @code{awk}
+@c
+@c when the POSIX is inside a @sc{} in a section, it comes out in
+@c a typewriter font, which isn't what we want. texinfo bug.
+
+The @sc{POSIX} Command Language And Utilities Standard for @code{awk}
+introduced the following changes into the language:
+
+@itemize @bullet{}
+@item
+The use of @samp{-W} for implementation-specific options.
+
+@item
+The use of @code{CONVFMT} for controlling the conversion of numbers
+to strings (@pxref{Conversion}).
+
+@item
+The concept of a numeric string, and tighter comparison rules to go
+with it (@pxref{Comparison Ops}).
+
+@item
+More complete documentation of many of the previously undocumented
+features of the language.
@end itemize
-@node S5R4/GNU, , S5R4, Language History
-@section Extensions In @code{gawk} Not In S5R4
+@node POSIX/GNU, Obsolete, POSIX, Language History
+@section Extensions In @code{gawk} Not In POSIX @code{awk}
+@c
+@c @section Extensions In @code{gawk} Not In @sc{POSIX} @code{awk}
The GNU implementation, @code{gawk}, adds these features:
@@ -7034,31 +8208,734 @@ The @code{AWKPATH} environment variable for specifying a path search for
the @samp{-f} command line option (@pxref{Command Line}).
@item
-The @samp{-C} and @samp{-V} command line options (@pxref{Command Line}).
+The various @code{gawk}-specific features available via the @samp{-W}
+command line option (@pxref{Command Line}).
@item
The @code{IGNORECASE} variable and its effects (@pxref{Case-sensitivity}).
@item
+The @code{FIELDWIDTHS} variable and its effects (@pxref{Constant Size}).
+
+@item
+The @code{systime} and @code{strftime} built-in functions for obtaining
+and printing time stamps (@pxref{Time Functions}).
+
+@item
The @file{/dev/stdin}, @file{/dev/stdout}, @file{/dev/stderr}, and
@file{/dev/fd/@var{n}} file name interpretation (@pxref{Special Files}).
@item
-The @samp{-c} option to turn off these extensions (@pxref{Command Line}).
+The @samp{-W compat} option to turn off these extensions (@pxref{Command Line}).
@item
-The @samp{-a} and @samp{-e} options to specify the syntax of regular
-expressions that @code{gawk} will accept (@pxref{Command Line}).
+The @samp{-W posix} option for full @sc{POSIX} compliance
+(@pxref{Command Line}).@refill
+
@end itemize
-@node Gawk Summary, Sample Program, Language History, Top
-@appendix @code{gawk} Summary
+@node Obsolete, Undocumented, POSIX/GNU, Language History
+@section Obsolete Options and/or Features
+
+@cindex deprecated options
+@cindex obsolete options
+@cindex deprecated features
+@cindex obsolete features
+This section describes features and/or command line options from the
+previous release of @code{gawk} that are either not available in the
+current version, or that are still supported but deprecated (meaning that
+they will @emph{not} be in the next release).
+
+@c update this section for each release!
+For version 2.13 of @code{gawk}, the following command line options
+are recognized, but produce a warning message (@pxref{Command Line}).
+
+@table @samp
@ignore
-See, man pages are good for something. This chapter started life as the
-gawk.1 man page for 2.11.
+@item -nostalgia
+Use @samp{-W nostalgia} instead.
@end ignore
+@item -c
+Use @samp{-W compat} instead.
+
+@item -V
+Use @samp{-W version} instead.
+
+@item -C
+Use @samp{-W copyright} instead.
+
+@item -a
+@itemx -e
+These options produce a warning message but have no effect on the
+execution of @code{gawk}. The @sc{POSIX} standard now specifies
+traditional @code{awk} regular expressions for the @code{awk} utility.
+@end table
+
+@node Undocumented, , Obsolete, Language History
+@section Undocumented Options and Features
+
+This section intentionally left blank.
+
+@c Read The Source, Luke!
+
+@ignore
+@c If these came out in the Info file or TeX manual, then they wouldn't
+@c be undocumented, would they?
+
+@code{gawk} has one undocumented option:
+
+@table @samp
+@item -W nostalgia
+Print the message @code{"awk: bailing out near line 1"} and dump core.
+This option was inspired by the common behavior of very early versions of
+Unix @code{awk}, and by a t--shirt.
+@end table
+
+Early versions of @code{awk} used to not require any separator (either
+a newline or @samp{;}) between the rules in @code{awk} programs. Thus,
+it was common to see one-line programs like:
+
+@example
+awk '@{ sum += $1 @} END @{ print sum @}'
+@end example
+
+@code{gawk} actually supports this, but it is purposely undocumented
+since it is considered bad style. The correct way to write such a program
+is either
+
+@example
+awk '@{ sum += $1 @} ; END @{ print sum @}'
+@end example
+
+@noindent
+or
+
+@example
+awk '@{ sum += $1 @}
+ END @{ print sum @}' data
+@end example
+
+@noindent
+@xref{Statements/Lines}, for a fuller explanation.
+
+As an accident of the implementation of the original Unix @code{awk}, if
+a built-in function used @code{$0} as its default argument, it was possible
+to call that function without the parentheses. In particular, it was
+common practice to use the @code{length} function in this fashion.
+For example, the pipeline:
+
+@example
+echo abcdef | awk '@{ print length @}'
+@end example
+
+@noindent
+would print @samp{6}.
+
+For backwards compatibility with old programs, @code{gawk} supports
+this usage, but only for the @code{length} function. New programs should
+@emph{not} call the @code{length} function this way. In particular,
+this usage will not be portable to other @sc{POSIX} compliant versions
+of @code{awk}. It is also poor style.
+
+@end ignore
+
+@node Installation, Gawk Summary, Language History, Top
+@chapter Installing @code{gawk}
+
+This chapter provides instructions for installing @code{gawk} on the
+various platforms that are supported by the developers. The primary
+developers support Unix (and one day, GNU), while the other ports were
+contributed. The file @file{ACKNOWLEDGMENT} in the @code{gawk}
+distribution lists the electronic mail addresses of the people who did
+the respective ports.@refill
+
+@menu
+* Gawk Distribution:: What is in the @code{gawk} distribution.
+
+* Unix Installation:: Installing @code{gawk} under various versions
+ of Unix.
+
+* VMS Installation:: Installing @code{gawk} on VMS.
+
+* MS-DOS Installation:: Installing @code{gawk} on MS-DOS.
+
+* Atari Installation:: Installing @code{gawk} on the Atari ST.
+@end menu
+
+@node Gawk Distribution, Unix Installation, Installation, Installation
+@section The @code{gawk} Distribution
+
+@ifinfo
+This section first describes how to get and extract the @code{gawk}
+distribution, and then discusses what is in the various files and
+subdirectories.
+@end ifinfo
+
+@menu
+* Extracting:: How to get and extract the distribution.
+
+* Distribution contents:: What is in the distribution.
+@end menu
+
+@node Extracting, Distribution contents, Gawk Distribution, Gawk Distribution
+@subsection Getting The @code{gawk} Distribution
+
+@cindex getting gawk
+@cindex anonymous ftp
+@cindex anonymous uucp
+@cindex ftp, anonymous
+@cindex uucp, anonymous
+@code{gawk} is distributed as a compressed @code{tar} file. You can
+get it via anonymous @code{ftp} to the Internet host @code{prep.ai.mit.edu}.
+Like all GNU software, it will be archived at other well known systems,
+from which it will be possible to use some sort of anonymous @code{uucp} to
+obtain the distribution as well.
+
+Once you have the distribution (for example, @file{gawk-2.13.3.tar.Z}), first
+use @code{uncompress} to expand the file, and then use @code{tar} to extract it.
+@code{uncompress} usually has a link named @code{zcat}, which causes it
+to decompress the file to the standard output. You can use the following
+pipeline to produce the @code{gawk} distribution:
+
+@example
+# Under System V, add 'o' to the tar flags
+zcat gawk-2.13.3.tar.Z | tar -xvpf -
+@end example
+
+@noindent
+This will create a directory named @file{gawk-2.13} in the current
+directory.
+
+The distribution file name is of the form @file{gawk-2.13.@var{n}.tar.Z}.
+The @var{n} represents a @dfn{patchlevel}, meaning that minor bugs have
+been fixed in the major release. The current patchlevel is 3, but when
+retrieving distributions, you should get the version with the highest
+patchlevel.@refill
+
+If you are not on a Unix system, you will need to make other arrangements
+for getting and extracting the @code{gawk} distribution. You should consult
+a local expert.
+
+@node Distribution contents, , Extracting, Gawk Distribution
+@subsection Contents Of The @code{gawk} Distribution
+
+@code{gawk} has a number of C source files, documentation files,
+subdirectories and files related to the configuration process
+(@pxref{Unix Installation}), and several subdirectories related to
+different, non-Unix, operating systems.@refill
+
+@table @asis
+@item The C and YACC source files
+The various @samp{.c}, @samp{.y}, and @samp{.h} files are the actual
+@code{gawk} source code.
+@end table
+
+@table @file
+@item README
+@itemx README.VMS
+@itemx README.dos
+@itemx README.rs6000
+@itemx README.ultrix
+Descriptive files: @file{README} for @code{gawk} under Unix, and the
+rest for the various hardware and software combinations.
+
+@item PORTS
+A list of systems to which @code{gawk} has been ported, and which
+have successfully run the test suite.
+
+@item ACKNOWLEDGMENT
+A list of the people who contributed major parts of the code or documentation.
+
+@item CHANGES
+A list of changes to @code{gawk} since the last release or patch.
+
+@item COPYING
+The GNU General Public License.
+
+@item FUTURES
+A brief list of features and/or changes being contemplated for future
+releases, with some indication of the time frame for the feature, based
+on its difficulty.
+
+@item LIMITATIONS
+A list of those factors that limit @code{gawk}'s performance.
+Most of these depend on the hardware or operating system software, and
+are not limits in @code{gawk} itself.@refill
+
+@item PROBLEMS
+A file describing known problems with the current release.
+
+@item gawk.1
+The @code{troff} source for a manual page describing @code{gawk}.
+
+@item gawk.texinfo
+@ifinfo
+The @code{texinfo} source file for this Info file.
+It should be processed with @TeX{} to produce a printed manual, and
+with @code{makeinfo} to produce the Info file.@refill
+@end ifinfo
+@iftex
+The @code{texinfo} source file for this manual.
+It should be processed with @TeX{} to produce a printed manual, and
+with @code{makeinfo} to produce the Info file.@refill
+@end iftex
+
+@item Makefile-dist
+@itemx config
+@itemx config.h-dist
+@itemx configure
+@itemx missing
+@itemx mkconf
+@itemx mungeconf
+These files and subdirectories are used when configuring @code{gawk}
+for various Unix systems. They are explained in detail in
+@ref{Unix Installation}.@refill
+
+@item atari
+Files needed for building @code{gawk} on an Atari ST.
+@xref{Atari Installation}, for details.
+
+@item pc
+Files needed for building @code{gawk} under MS-DOS.
+@xref{MS-DOS Installation}, for details.
+
+@item vms
+Files needed for building @code{gawk} under VAX VMS.
+@xref{VMS Installation}, for details.
+
+@item test
+Many interesting @code{awk} programs, provided as a test suite for
+@code{gawk}. You can use @samp{make test} from the top level @code{gawk}
+directory to run your version of @code{gawk} against the test suite.
+There are many programs here that are useful in their own right.
+If @code{gawk} successfully passes @samp{make bigtest} then you can
+be confident of a successful port.@refill
+@end table
+
+@node Unix Installation, VMS Installation, Gawk Distribution, Installation
+@section Compiling And Installing @code{gawk} on Unix
+
+@menu
+* Quick Installation:: Compiling @code{gawk} on a supported Unix version.
+
+* Configuration Philosophy:: How it's all supposed to work.
+
+* New Configurations:: What to do if there is no supplied configuration
+ for your system.
+@end menu
+
+@node Quick Installation, Configuration Philosophy, Unix Installation, Unix Installation
+@subsection Compiling @code{gawk} For A Supported Unix Version
+
+@cindex installation, unix
+After you have extracted the @code{gawk} distribution, @code{cd}
+to @file{gawk-2.13}. Look in the @file{config} subdirectory for a
+file that matches your hardware/software combination. In general,
+only the software is relevant, for example @code{sunos41} is used
+for SunOS 4.1, on both Sun 3 and Sun 4 hardware.@refill
+
+If you find such a file, run the command:
+
+@example
+# assume you have SunOS 4.1
+./configure sunos41
+@end example
+
+This produces a @file{Makefile} and @file{config.h} tailored to your
+system. You may wish to edit the @file{Makefile} to use a different
+C compiler, such as @code{gcc}, the GNU C compiler, if you have it.
+You may also wish to change the @code{CFLAGS} variable, which controls
+the command line options that are passed to the C compiler (such as
+optimization levels, or compiling for debugging).@refill
+
+After you have configured @file{Makefile} and @file{config.h}, type:
+
+@example
+make
+@end example
+
+@noindent
+and shortly thereafter, you should have an executable version of @code{gawk}.
+That's all there is to it!
+
+@node Configuration Philosophy, New Configurations, Quick Installation, Unix Installation
+@subsection The Configuration Process
+
+(This section is of interest only if you know something about using the
+C language and the Unix operating system.)
+
+The source code for @code{gawk} generally attempts to adhere to industry
+standards wherever possible. This means that @code{gawk} uses library
+routines that are specified by the @sc{ANSI} C standard and by the @sc{POSIX}
+operating system interface standard. When using an @sc{ANSI} C compiler,
+function prototypes are provided to help improve the compile-time checking.
+
+Many older Unix systems do not support all of either the @sc{ANSI} or the
+@sc{POSIX} standards. The @file{missing} subdirectory in the @code{gawk}
+distribution contains replacement versions of those subroutines that are
+most likely to be missing.
+
+The @file{config.h} file that is created by the @code{configure} program
+contains definitions that describe features of the particular operating
+system where you are attempting to compile @code{gawk}. For the most
+part, it lists which standard subroutines are @emph{not} available.
+For example, if your system lacks the @samp{getopt} routine, then
+@samp{GETOPT_MISSING} would be defined.
+
+@file{config.h} also defines constants that describe facts about your
+variant of Unix. For example, there may not be an @samp{st_blksize}
+element in the @code{stat} structure. In this case @samp{BLKSIZE_MISSING}
+would be defined.
+
+Based on the list in @file{config.h} of standard subroutines that are
+missing, @file{missing.c} will do a @samp{#include} of the appropriate
+file(s) from the @file{missing} subdirectory.@refill
+
+Conditionally compiled code in the other source files relies on the
+other definitions in the @file{config.h} file.
+
+Besides creating @file{config.h}, @code{configure} produces a @file{Makefile}
+from @file{Makefile-dist}. There are a number of lines in @file{Makefile-dist}
+that are system or feature specific. For example, there is a line that begins
+with @samp{##MAKE_ALLOCA_C##}. This is normally a comment line, since
+it starts with @samp{#}. If a configuration file has @samp{MAKE_ALLOCA_C}
+in it, then @code{configure} will delete the @samp{##MAKE_ALLOCA_C##}
+from the beginning of the line. This will enable the rules in the
+@file{Makefile} that use a C version of @samp{alloca}. There are several
+similar features that work in this fashion.@refill
+
+The file @file{mkconf} is a link to @file{configure}. This name for
+the program is left over from an earlier patchlevel of @code{gawk} 2.13.
+For the next release of @code{gawk}, the distribution will comply fully
+with the GNU standards for software distributions. @file{Makefile-dist}
+will be renamed @file{Makefile.in}, and @file{mkconf} will go away.@refill
+
+@node New Configurations, , Configuration Philosophy, Unix Installation
+@subsection Configuring @code{gawk} For A New System
+
+(This section is of interest only if you know something about using the
+C language and the Unix operating system, and if you have to install
+@code{gawk} on a system that is not supported by the @code{gawk} distribution.
+If you are a C or Unix novice, get help from a local expert.)
+
+If you need to configure @code{gawk} for a Unix system that is not
+supported in the distribution, first see @ref{Configuration Philosophy}.
+Then, copy @file{config.h-dist} to @file{config.h}, and copy
+@file{Makefile-dist} to @file{Makefile}.@refill
+
+Next, edit both files. Both files are liberally commented, and the
+necessary changes should be straightforward.
+
+While editing @file{config.h}, you need to determine what library
+routines you do or do not have by consulting your system documentation, or
+by perusing your actual libraries using the @code{ar} or @code{nm} utilities.
+In the worst case, simply do not define @emph{any} of the macros for missing
+subroutines. When you compile @code{gawk}, the final link-editing step
+will fail. The link editor will provide you with a list of unresolved external
+references---these are the missing subroutines. Edit @file{config.h} again
+and recompile, and you should be set.@refill
+
+Editing the @file{Makefile} should also be straightforward. Enable or
+disable the lines that begin with @samp{##MAKE_@var{whatever}##}, as
+appropriate. Select the correct C compiler and @code{CFLAGS} for it.
+Then run @code{make}.
+
+Getting a correct configuration is likely to be an iterative process.
+Do not be discouraged if it takes you several tries. If you have no
+luck whatsoever, please report your system type, and the steps you took.
+Once you do have a working configuration, please send it to the maintainers
+so that support for your system can be added to the official release.
+
+@xref{Bugs}, for information on how to report problems in configuring
+@code{gawk}. You may also use the same mechanisms for sending in new
+configurations.@refill
+
+@node VMS Installation, MS-DOS Installation, Unix Installation, Installation
+@section Compiling, Installing, and Running @code{gawk} on VMS
+
+@c based on material from
+@c Pat Rankin <gawk.rankin@eql.caltech.edu>
+
+@cindex installation, vms
+This section describes how to compile and install @code{gawk} under VMS.
+
+@menu
+* VMS Compilation:: How to compile @code{gawk} under VMS.
+
+* VMS Installation Details:: How to install @code{gawk} under VMS.
+
+* VMS Running:: How to run @code{gawk} under VMS.
+@end menu
+
+@node VMS Compilation, VMS Installation Details, VMS Installation, VMS Installation
+@subsection Compiling @code{gawk} under VMS
+
+To compile @code{gawk} under VMS, there is a @code{DCL} command procedure that
+will issue all the necessary @code{CC} and @code{LINK} commands, and there is
+also a @file{Makefile} for use with the @code{MMS} utility. From the source
+directory, use either
+
+@example
+$ @@[.VMS]VMSBUILD.COM
+@end example
+
+@noindent
+or
+
+@example
+$ MMS/DESCRIPTION=[.VMS]DESCRIP.MMS GAWK
+@end example
+
+Depending upon which C compiler you are using, follow one of the sets
+of instructions in this table:
+
+@table @asis
+@item VAX C V3.x
+Use either @file{vmsbuild.com} or @file{descrip.mms} as is. These use
+@code{CC/OPTIMIZE=NOLINE}, which is essential for version 3.0.
+
+@item VAX C V2.x
+You must have version 2.3 or 2.4; older ones won't work. Edit either
+@file{vmsbuild.com} or @file{descrip.mms} according to the comments in them.
+For @file{vmsbuild.com}, this just entails removing two @samp{!} delimiters.
+Also edit @file{config.h} (which is a copy of file @file{[.config]vms-conf.h})
+and comment out or delete the two lines @samp{#define __STDC__ 0} and
+@samp{#define VAXC_BUILTINS} near the end.@refill
+
+@item GNU C
+Edit @file{vmsbuild.com} or @file{descrip.mms}; the changes are different
+from those for VAX C V2.x, but equally straightforward. No changes to
+@file{config.h} should be needed.
+@end table
+
+@code{gawk} 2.13 has been tested under VMS V5.3 and V5.4-2 using VAX C V3.2,
+V3.1, and V2.3 and also GNU C V1.39. It should work without modifications for
+VMS V4.6 and up.
+
+@node VMS Installation Details, VMS Running, VMS Compilation, VMS Installation
+@subsection Installing @code{gawk} on VMS
+
+To install @code{gawk}, all you need is a ``foreign'' command, which is
+a @code{DCL} symbol whose value begins with a dollar sign.
+
+@example
+$ GAWK :== $device:[directory]GAWK
+@end example
+
+@noindent
+(Substitute the actual location of @code{gawk.exe} for
+@samp{device:[directory]}.) The symbol should be placed in the
+@file{login.com} of any user who wishes to run @code{gawk},
+so that it will be defined every time the user logs on.
+Alternatively, the symbol may be placed in the system-wide
+@file{sylogin.com} procedure, which will allow all users
+to run @code{gawk}.@refill
+
+Optionally, the help entry can be loaded into a VMS help library:
+
+@example
+$ LIBRARY/HELP SYS$HELP:HELPLIB [.VMS]GAWK.HLP
+@end example
+
+@noindent
+(You may want to substitute a site-specific help library rather than
+the standard VMS library @samp{HELPLIB}.) After loading the help text,
+
+@example
+$ HELP GAWK
+@end example
+
+@noindent
+will provide information about both the @code{gawk} implementation and the
+@code{awk} programming language.
+
+The logical name @samp{AWK_LIBRARY} can designate a default location
+for @code{awk} program files. For the @samp{-f} option, if the specified
+filename has no device or directory path information in it, @code{gawk}
+will look in the current directory first, then in the directory specified
+by the translation of @samp{AWK_LIBRARY} if the file was not found.
+If after searching in both directories, the file still is not found,
+then @code{gawk} appends the suffix @samp{.awk} to the filename and the
+file search will be re-tried. If @samp{AWK_LIBRARY} is not defined, that
+portion of the file search will fail benignly.@refill
+
+@node VMS Running, , VMS Installation Details, VMS Installation
+@subsection Running @code{gawk} on VMS
+
+Command line parsing and quoting conventions are significantly different
+on VMS, so examples in this manual or from other sources often need minor
+changes. They @emph{are} minor though, and all @code{awk} programs
+should run correctly.
+
+Here are a couple of trivial tests:
+
+@example
+$ gawk -- "BEGIN @{print ""Hello, World!""@}"
+$ gawk -"W" version ! could also be -"W version" or "-W version"
+@end example
+
+@noindent
+Note that upper-case and mixed-case text must be quoted.
+
+The VMS port of @code{gawk} includes a @code{DCL}-style interface in addition
+to the original shell-style interface (see the help entry for details).
+One side-effect of dual command line parsing is that if there is only a
+single parameter (as in the quoted string program above), the command
+becomes ambiguous. To work around this, the normally optional @samp{--}
+flag is required to force Unix style rather than @code{DCL} parsing. If any
+other dash-type options (or multiple parameters such as data files to be
+processed) are present, there is no ambiguity and @samp{--} can be omitted.
+
+The default search path when looking for @code{awk} program files specified
+by the @samp{-f} option is @code{"SYS$DISK:[],AWK_LIBRARY:"}. The logical
+name @samp{AWKPATH} can be used to override this default. The format
+of @samp{AWKPATH} is a comma-separated list of directory specifications.
+When defining it, the value should be quoted so that it retains a single
+translation, and not a multi-translation @code{RMS} searchlist.
+
+@node MS-DOS Installation, Atari Installation, VMS Installation, Installation
+@section Installing @code{gawk} on MS-DOS
+
+@cindex installation, ms-dos
+The first step is to get all the files in the @code{gawk} distribution
+onto your PC. Move all the files from the @file{pc} directory into
+the main directory where the other files are. Edit the file
+@file{make.bat} so that it will be an acceptable MS-DOS batch file.
+This means making sure that all lines are terminated with ASCII
+Carriage Return and Line Feed characters. Rename the Unix file
+@file{awk.tab.c} to @file{awktab.c}, since MS-DOS has file naming
+restrictions.
+
+@code{gawk} has only been compiled with version 5.1 of the Microsoft
+C compiler. The file @file{make.bat} from the @file{pc} directory
+assumes that you have this compiler.
+
+Copy the file @file{setargv.obj} from the library directory where it
+resides to the @code{gawk} source code directory.
+
+Run @file{make.bat}. This will compile @code{gawk} for you, and link it.
+That's all there is to it!
+
+@node Atari Installation, , MS-DOS Installation, Installation
+@section Installing @code{gawk} on the Atari ST
+
+@c based on material from
+@c Michal Jaegermann <ntomczak@vm.ucs.ualberta.ca>
+
+@cindex installation, atari
+This section assumes that you are running TOS. It applies to other Atari
+models (STe, TT) as well.
+
+In order to use @code{gawk}, you need to have a shell, either text or
+graphics, that does not map all the characters of a command line to
+upper case. Maintaining case distinction in option flags is very
+important (@pxref{Command Line}). Popular shells like @code{gulam}
+or @code{gemini} will work, as will newer versions of @code{desktop}.
+Support for I/O redirection is necessary to make it easy to import
+@code{awk} programs from other environments. Pipes are nice to have,
+but not vital.
+
+If you have received an executable version of @code{gawk}, place it,
+as usual, anywhere in your @code{PATH} where your shell will find it.
+
+While executing, @code{gawk} creates a number of temporary files.
+@code{gawk} looks for either of the environment variables @code{TEMP}
+or @code{TMPDIR}, in that order. If either one is found, its value
+is assumed to be a directory for temporary files. This directory
+must exist, and if you can spare the memory, it is a good idea to
+put it on a @sc{RAM} drive. If neither @code{TEMP} nor @code{TMPDIR}
+is found, then @code{gawk} uses the current directory for its
+temporary files.
+
+The ST version of @code{gawk} searches for its program files as
+described in @ref{AWKPATH Variable}. On the ST, the default value for
+@code{AWKPATH} is @code{@w{".,c:\lib\awk,c:\gnu\lib\awk"}}.
+The search path can be modified by explicitly setting @code{AWKPATH} to
+whatever you wish. Note that colons cannot be used on the ST to separate
+elements in the @code{AWKPATH} variable, since they have another, reserved,
+meaning. Instead, you must use a comma to separate elements in the path.
+If you are recompiling @code{gawk} on the ST, then you can choose a new
+default search path, by setting the value of @samp{DEFPATH} in the file
+@file{...\config\atari}. You may choose a different separator character
+by setting the value of @samp{ENVSEP} in the same file. The new values will
+be used when creating the header file @file{config.h}.@refill
+
+@ignore
+As a last resort, small
+adjustments can be made directly on the executable version of @code{gawk}
+using a binary editor.@refill
+@end ignore
+
+Although @code{awk} allows great flexibility in doing I/O redirections
+from within a program, this facility should be used with care on the ST.
+In some circumstances the OS routines for file handle pool processing
+lose track of certain events, causing the computer to crash, and requiring
+a reboot. Often a warm reboot is sufficient. Fortunately, this happens
+infrequently, and in rather esoteric situations. In particular, avoid
+having one part of an @code{awk} program using @code{print}
+statements explicitly redirected to @code{"/dev/stdout"}, while other
+@code{print} statements use the default standard output, and a
+calling shell has redirected standard output to a file.@refill
+@c whew!
+
+When @code{gawk} is compiled with the ST version of @code{gcc} and its
+usual libraries, it will accept both @samp{/} and @samp{\} as path separators.
+While this is convenient, it should be remembered that this removes one,
+technically legal, character (@samp{/}) from your file names, and that
+it may create problems for external programs, called via the @code{system()}
+function, which may not support this convention. Whenever it is possible
+that a file created by @code{gawk} will be used by some other program,
+use only backslashes. Also remember that in @code{awk}, backslashes in
+strings have to be doubled in order to get literal backslashes.
+
+The initial port of @code{gawk} to the ST was done with @code{gcc}.
+If you wish to recompile @code{gawk} from scratch, you will need to use
+a compiler that accepts @sc{ANSI} standard C (such as @code{gcc}, Turbo C,
+or Prospero C). If @code{sizeof(int) != @w{sizeof(int *)}}, the correctness
+of the generated code depends heavily on the fact that all function calls
+have function prototypes in the current scope. If your compiler does
+not accept function prototypes, you will probably have to add a
+number of casts to the code.@refill
+
+If you are using @code{gcc}, make sure that you have up-to-date libraries.
+Older versions have problems with some library functions (@code{atan2()},
+@code{strftime()}, the @samp{%g} conversion in @code{sprintf()}) which
+may affect the operation of @code{gawk}.
+
+In the @file{atari} subdirectory of the @code{gawk} distribution is
+a version of the @code{system()} function that has been tested with
+@code{gulam} and @code{msh}; it should work with other shells as well.
+With @code{gulam}, it passes the string to be executed without spawning
+an extra copy of a shell. It is possible to replace this version of
+@code{system()} with a similar function from a library or from some other
+source if that version would be a better choice for the shell you prefer.
+
+The files needed to recompile @code{gawk} on the ST can be found in
+the @file{atari} directory. The provided files and instructions below
+assume that you have the GNU C compiler (@code{gcc}), the @code{gulam} shell,
+and an ST version of @code{sed}. The @file{Makefile} is set up to use
+@file{byacc} as a @file{yacc} replacement. With a different set of tools some
+adjustments and/or editing will be needed.@refill
+
+@code{cd} to the @file{atari} directory. Copy @file{Makefile.st} to
+@file{makefile} in the source (parent) directory. Possibly adjust
+@file{../config/atari} to suit your system. Execute the script @file{mkconf.g}
+which will create the header file @file{../config.h}. Go back to the source
+directory. If you are not using @code{gcc}, check the file @file{missing.c}.
+It may be necessary to change forward slashes in the references to files
+from the @file{atari} subdirectory into backslashes. Type @code{make} and
+enjoy.@refill
+
+Compilation with @code{gcc} of some of the bigger modules, like
+@file{awk_tab.c}, may require a full four megabytes of memory. On smaller
+machines you would need to cut down on optimizations, or you would have to
+switch to another, less memory hungry, compiler.@refill
+
+@node Gawk Summary, Sample Program, Installation, Top
+@appendix @code{gawk} Summary
+
This appendix provides a brief summary of the @code{gawk} command line and the
@code{awk} language. It is designed to serve as a ``quick reference.'' It is
therefore terse, but complete.
@@ -7080,8 +8957,8 @@ values to be made available in the @code{ARGC} and @code{ARGV}
predefined @code{awk} variables:
@example
-awk @r{[@code{-F@var{fs}}] [@code{-v @var{var}=@var{val}}] [@code{-V}] [@code{-C}] [@code{-c}] [@code{-a}] [@code{-e}] [@code{--}]} '@var{program}' @var{file} @dots{}
-awk @r{[@code{-F@var{fs}}] @code{-f @var{source-file}} [@code{-f @var{source-file} @dots{}}] [@code{-v @var{var}=@var{val}}] [@code{-V}] [@code{-C}] [@code{-c}] [@code{-a}] [@code{-e}] [@code{--}]} @var{file} @dots{}
+awk @r{[@code{-F@var{fs}}] [@code{-W} @var{gawk-opts}] [@code{-v @var{var}=@var{val}}] [@code{--}]} '@var{program}' @var{file} @dots{}
+awk @r{[@code{-F@var{fs}}] [@code{-W} @var{gawk-opts}] [@code{-v @var{var}=@var{val}}]} @code{-f} @var{source-file} @r{[@code{-f @var{source-file} @dots{}}]} @var{file} @dots{}
@end example
The options that @code{gawk} accepts are:
@@ -7099,32 +8976,30 @@ of from the first command line argument.
Assign the variable @var{var} the value @var{val} before program execution
begins.
-@item -a
-Specifies use of traditional @code{awk} syntax for regular expressions.
-This means that @samp{\} can be used to quote regular expression
-operators inside of square brackets, just as it can be outside of them.
-
-@item -e
-Specifies use of @code{egrep} syntax for regular expressions. This
-means that @samp{\} does not serve as a quoting character inside of
-square brackets.
-
-@item -c
+@item -W compat
Specifies compatibility mode, in which @code{gawk} extensions are turned
off.
-@item -V
+@item -W posix
+Specifies @sc{POSIX} compatibility mode, in which @code{gawk} extensions
+are turned off, and additional restrictions apply.
+
+@item -W version
Print version information for this particular copy of @code{gawk} on the error
output. This option may disappear in a future version of @code{gawk}.
-@item -C
+@item -W copyleft
+@itemx -W copyright
Print the short version of the General Public License on the error
output. This option may disappear in a future version of @code{gawk}.
+@item -W lint
+Give warnings about dubious or non-portable @code{awk} constructs.
+
@item --
Signal the end of options. This is useful to allow further arguments to the
@code{awk} program itself to start with a @samp{-}. This is mainly for
-consistency with the argument parsing conventions of POSIX.
+consistency with the argument parsing conventions of @sc{POSIX}.
@end table
Any other options are flagged as invalid, but are otherwise ignored.
@@ -7151,16 +9026,14 @@ order they are specified. This is useful for building libraries of
@code{awk} functions, without having to include them in each new
@code{awk} program that uses them. To use a library function in a file
from a program typed in on the command line, specify @samp{-f /dev/tty};
-then type your program, and end it with a @kbd{C-d}. @xref{Command
-Line}.
+then type your program, and end it with a @kbd{Control-d}. @xref{Command Line}.
The environment variable @code{AWKPATH} specifies a search path to use
-when finding source files named with the @samp{-f} option. If the
-variable @code{AWKPATH} is not set, @code{gawk} uses the default path,
-@samp{.:/usr/lib/awk:/usr/local/lib/awk}. If a file name given to the
-@samp{-f} option contains a @samp{/} character, no path search is
-performed. @xref{AWKPATH Variable}, for a full description of the
-@code{AWKPATH} environment variable.@refill
+when finding source files named with the @samp{-f} option. The default path
+@samp{.:/usr/lib/awk:/usr/local/lib/awk} is used if @code{AWKPATH} is not set.
+If a file name given to the @samp{-f} option contains a @samp{/} character,
+no path search is performed. @xref{AWKPATH Variable}, for a full description
+of the @code{AWKPATH} environment variable.@refill
@code{gawk} compiles the program into an internal form, and then proceeds to
read each file named in the @code{ARGV} array. If there are no files named
@@ -7169,6 +9042,8 @@ on the command line, @code{gawk} reads the standard input.
If a ``file'' named on the command line has the form
@samp{@var{var}=@var{val}}, it is treated as a variable assignment: the
variable @var{var} is assigned the value @var{val}.
+If any of the files have a value that is the null string, that
+element in the list is skipped.@refill
For each line in the input, @code{gawk} tests to see if it matches any
@var{pattern} in the @code{awk} program. For each pattern that the line
@@ -7237,8 +9112,14 @@ The number of command line arguments (not including options or the
@item ARGV
The array of command line arguments. The array is indexed from 0 to
-@code{ARGC} - 1. Dynamically changing the contents of @code{ARGV} can control
-the files used for data.@refill
+@code{ARGC} @minus{} 1. Dynamically changing the contents of @code{ARGV}
+can control the files used for data.@refill
+
+@item CONVFMT
+The conversion format to use when converting numbers to strings.
+
+@item FIELDWIDTHS
+A space-separated list of numbers describing the fixed-width input data.
@item ENVIRON
An array containing the values of the environment variables. The array
@@ -7278,7 +9159,8 @@ The number of fields in the current input record.
The total number of input records seen so far.
@item OFMT
-The output format for numbers, @code{"%.6g"} by default.
+The output format for numbers for the @code{print} statement,
+@code{"%.6g"} by default.
@item OFS
The output field separator, a blank by default.
@@ -7311,9 +9193,9 @@ default @code{"\034"}.
@appendixsubsec Arrays
Arrays are subscripted with an expression between square brackets
-(@samp{[} and @samp{]}). The expression may be either a number or
-a string. Since arrays are associative, string indices are meaningful
-and are not converted to numbers.
+(@samp{[} and @samp{]}). Array subscripts are @emph{always} strings;
+numbers are converted to strings as necessary, following the standard
+conversion rules (@pxref{Conversion}).@refill
If you use multiple expressions separated by commas inside the square
brackets, then the array subscript is a string consisting of the
@@ -7353,7 +9235,8 @@ of the string as a numeral. If the string does not look like a
numeral, it converts to 0.
Certain contexts (such as concatenation) require string values.
-They convert numbers to strings by effectively printing them.
+They convert numbers to strings by effectively printing them
+with @code{sprintf}. @xref{Conversion}, for the details.
To force conversion of a string value to a number, simply add 0
to it. If the value you start with is already a number, this
@@ -7363,8 +9246,9 @@ To force conversion of a numeric value to a string, concatenate it with
the null string.
The @code{awk} language defines comparisons as being done numerically if
-possible, otherwise one or both operands are converted to strings and
-a string comparison is performed.
+both operands are numeric, or if one is numeric and the other is a numeric
+string. Otherwise one or both operands are converted to strings and a
+string comparison is performed.
Uninitialized variables have the string value @code{""} (the null, or
empty, string). In contexts where a number is required, this is
@@ -7442,14 +9326,14 @@ patterns cannot be combined with other patterns in pattern expressions.
For @samp{/@var{regular-expression}/} patterns, the associated statement is
executed for each input line that matches the regular expression. Regular
-expressions are the same as those in @code{egrep}, and are summarized below.
+expressions are extensions of those in @code{egrep}, and are summarized below.
A @var{relational expression} may use any of the operators defined below in
the section on actions. These generally test whether certain fields match
certain regular expressions.
-The @samp{&&}, @samp{||}, and @samp{!} operators are logical ``and'',
-logical ``or'', and logical ``not'', respectively, as in C. They do
+The @samp{&&}, @samp{||}, and @samp{!} operators are logical ``and,''
+logical ``or,'' and logical ``not,'' respectively, as in C. They do
short-circuit evaluation, also as in C, and are used for combining more
primitive pattern expressions. As in most languages, parentheses may be
used to change the order of evaluation.
@@ -7538,6 +9422,7 @@ and input/output statements available are patterned after those in C.
* Special File Summary:: Special file names interpreted internally.
* Numeric Functions Summary:: Built-in numeric functions.
* String Functions Summary:: Built-in string functions.
+* Time Functions Summary:: Built-in time functions.
* String Constants Summary:: Escape sequences in strings.
@end menu
@@ -7583,7 +9468,7 @@ Unary plus, unary minus, and logical negation.
@item ^
Exponentiation (@samp{**} may also be used, and @samp{**=} for the assignment
-operator).
+operator, but they are not specified in the @sc{POSIX} standard).
@item ++ --
Increment and decrement, both prefix and postfix.
@@ -7658,7 +9543,7 @@ Format and print on @var{file}.
Other input/output redirections are also allowed. For @code{print} and
@code{printf}, @samp{>> @var{file}} appends output to the @var{file},
-while @samp{| @var{command}} writes on a pipe. In a similar fashion,
+and @samp{| @var{command}} writes on a pipe. In a similar fashion,
@samp{@var{command} | getline} pipes input into @code{getline}.
@code{getline} returns 0 on end of file, and @minus{}1 on an error.@refill
@@ -7680,11 +9565,9 @@ treated as a character and printed. Otherwise, the argument is assumed to
be a string, and only the first character of that string is printed.
@item %d
+@itemx %i
A decimal number (the integer part).
-@item %i
-Also a decimal integer.
-
@item %e
A floating point number of the form
@samp{@r{[}-@r{]}d.ddddddE@r{[}+-@r{]}dd}.@refill
@@ -7694,8 +9577,8 @@ A floating point number of the form
@r{[}@code{-}@r{]}@code{ddd.dddddd}.
@item %g
-Use @samp{%e} or @samp{%f} conversion, whichever is shorter, with
-nonsignificant zeros suppressed.
+Use @samp{%e} or @samp{%f} conversion, whichever produces a shorter string,
+with nonsignificant zeros suppressed.
@item %o
An unsigned octal number (again, an integer).
@@ -7730,6 +9613,10 @@ A number indicating the maximum width of strings or digits to the right
of the decimal point.
@end table
+Either or both of the @var{width} and @var{prec} values may be specified
+as @samp{*}. In that case, the particular value is taken from the argument
+list.
+
@xref{Printf}, for examples and for a more detailed description.
@node Special File Summary, Numeric Functions Summary, Printf Summary, Actions Summary
@@ -7797,7 +9684,7 @@ is provided, the time of day is used. The return value is the previous
seed for the random number generator.
@end table
-@node String Functions Summary, String Constants Summary, Numeric Functions Summary, Actions Summary
+@node String Functions Summary, Time Functions Summary, Numeric Functions Summary, Actions Summary
@appendixsubsubsec String Functions
@code{awk} has the following predefined string functions:
@@ -7813,7 +9700,8 @@ returns the index of the string @var{t} in the string @var{s}, or 0 if
@var{t} is not present.
@item length(@var{s})
-returns the length of the string @var{s}.
+returns the length of the string @var{s}. The length of @code{$0}
+is returned if no argument is supplied.
@item match(@var{s}, @var{r})
returns the position in @var{s} where the regular expression @var{r}
@@ -7850,9 +9738,29 @@ Nonalphabetic characters are left unchanged.
Execute the command @var{cmd-line}, and return the exit status.
@end table
+@node Time Functions Summary, String Constants Summary, String Functions Summary, Actions Summary
+@appendixsubsubsec Built-in time functions
+
+The following two functions are available for getting the current
+time of day, and for formatting time stamps.
+
+@table @code
+@item systime
+returns the current time of day as the number of seconds since a particular
+epoch (Midnight, January 1, 1970 UTC, on @sc{POSIX} systems).
+
+@item strftime(@var{format}, @var{timestamp})
+formats @var{timestamp} according to the specification in @var{format}.
+The current time of day is used if no @var{timestamp} is supplied.
+@xref{Time Functions}, for the details on the conversion specifiers
+that @code{strftime} accepts.
+@end table
+
+@iftex
@xref{Built-in}, for a description of all of @code{awk}'s built-in functions.
+@end iftex
-@node String Constants Summary, , String Functions Summary, Actions Summary
+@node String Constants Summary, , Time Functions Summary, Actions Summary
@appendixsubsubsec String Constants
String constants in @code{awk} are sequences of characters enclosed
@@ -7886,10 +9794,11 @@ Vertical tab.
@item \x@var{hex digits}
The character represented by the string of hexadecimal digits following
-the @samp{\x}. As in ANSI C, all following hexadecimal digits are
+the @samp{\x}. As in @sc{ANSI} C, all following hexadecimal digits are
considered part of the escape sequence. (This feature should tell us
something about language design by committee.) E.g., @code{"\x1B"} is a
-string containing the ASCII ESC (escape) character.
+string containing the ASCII ESC (escape) character. (The @samp{\x}
+escape sequence is not in @sc{POSIX} @code{awk}.)
@item \@var{ddd}
The character represented by the 1-, 2-, or 3-digit sequence of octal
@@ -7923,15 +9832,18 @@ If there are fewer arguments passed than there are names in @var{parameter-list}
the extra names are given the null string as value. Extra names have the
effect of local variables.
-The open-parenthesis in a function call must immediately follow the
-function name, without any intervening white space. This is to avoid a
-syntactic ambiguity with the concatenation operator.
+The open-parenthesis in a function call of a user-defined function must
+immediately follow the function name, without any intervening white space.
+This is to avoid a syntactic ambiguity with the concatenation operator.
-The word @code{func} may be used in place of @code{function}.
+The word @code{func} may be used in place of @code{function} (but not in
+@sc{POSIX} @code{awk}).
+
+Use the @code{return} statement to return a value from a function.
@xref{User-defined}, for a more complete description.
-@node Sample Program, Notes, Gawk Summary, Top
+@node Sample Program, Bugs, Gawk Summary, Top
@appendix Sample Program
The following example is a complete @code{awk} program, which prints
@@ -7992,38 +9904,64 @@ interested in which words occur most frequently, or having an alphabetized
table of how frequently each word occurs.@refill
@end itemize
-The way to solve these problems is to use other system utilities to
-process the input and output of the @code{awk} script. Suppose the
-script shown above is saved in the file @file{frequency.awk}. Then the
-shell command:@refill
+The way to solve these problems is to use some of the more advanced
+features of the @code{awk} language. First, we use @code{tolower} to remove
+case distinctions. Next, we use @code{gsub} to remove punctuation
+characters. Finally, we use the system @code{sort} utility to process the
+output of the @code{awk} script. First, here is the new version of
+the program:@refill
@example
-tr A-Z a-z < file1 | tr -cd 'a-z\012' \
- | awk -f frequency.awk \
- | sort +1 -nr
+awk '
+# Print list of word frequencies
+@{
+ $0 = tolower($0) # remove case distinctions
+ gsub(/[^a-z0-9_ \t]/, "", $0) # remove punctuation
+ for (i = 1; i <= NF; i++)
+ freq[$i]++
+@}
+
+END @{
+ for (word in freq)
+ printf "%s\t%d\n", word, freq[word]
+@}'
+@end example
+
+Assuming we have saved this program in a file named @file{frequency.awk},
+and that the data is in @file{file1}, the following pipeline
+
+@example
+awk -f frequency.awk file1 | sort +1 -nr
@end example
@noindent
produces a table of the words appearing in @file{file1} in order of
decreasing frequency.
-The first @code{tr} command in this pipeline translates all the upper case
-characters in @file{file1} to lower case. The second @code{tr} command
-deletes all the characters in the input except lower case characters and
-newlines. The second argument to the second @code{tr} is quoted to protect
-the backslash in it from being interpreted by the shell. The @code{awk}
-program reads this suitably massaged data and produces a word frequency
-table, which is not ordered.
+The @code{awk} program suitably massages the data and produces a word
+frequency table, which is not ordered.
-The @code{awk} script's output is now sorted by the @code{sort} command and
+The @code{awk} script's output is then sorted by the @code{sort} command and
printed on the terminal. The options given to @code{sort} in this example
specify to sort by the second field of each input line (skipping one field),
that the sort keys should be treated as numeric quantities (otherwise
@samp{15} would come before @samp{5}), and that the sorting should be done
in descending (reverse) order.@refill
+We could have even done the @code{sort} from within the program, by
+changing the @code{END} action to:
+
+@example
+END @{
+ sort = "sort +1 -nr"
+ for (word in freq)
+ printf "%s\t%d\n", word, freq[word] | sort
+ close(sort)
+@}'
+@end example
+
See the general operating system documentation for more information on how
-to use the @code{tr} and @code{sort} commands.@refill
+to use the @code{sort} command.@refill
@ignore
@strong{ADR: I have some more substantial programs courtesy of Rick Adams
@@ -8032,9 +9970,48 @@ instead of this program.}
@strong{I would also like to incorporate the general @code{translate}
function that I have written.}
+
+@strong{I have a ton of other sample programs to include too.}
@end ignore
-@node Notes, Glossary, Sample Program, Top
+@node Bugs, Notes, Sample Program, Top
+@appendix Reporting Problems and Bugs
+
+@c This chapter stolen shamelessly from the GNU m4 manual.
+
+If you have problems with @code{gawk} or think that you have found a bug,
+please report it to the developers; we don't promise to do anything
+but we might well want to fix it.
+
+Before reporting a bug, make sure you have actually found a real bug.
+Carefully reread the documentation and see if it really says you can do
+what you're trying to do. If it's not clear whether you should be able
+to do something or not, report that too; it's a bug in the documentation!
+
+Before reporting a bug or trying to fix it yourself, try to isolate it
+to the smallest possible @code{awk} program and input data file that
+reproduces the problem. Then send us the program and data file,
+some idea of what kind of Unix system you're using, and the exact results
+@code{gawk} gave you. Also say what you expected to occur; this will help
+us decide whether the problem was really in the documentation.
+
+Once you have a precise problem, send e-mail to (Internet)
+@samp{bug-gnu-utils@@prep.ai.mit.edu} or (UUCP)
+@samp{mit-eddie!prep.ai.mit.edu!bug-gnu-utils}. Please include the
+version number of @code{gawk} you are using. You can get this information
+with the command @samp{gawk -W version '@{@}' /dev/null}.
+You should send carbon copies of your mail to David Trueman at
+@samp{david@@cs.dal.ca}, and to Arnold Robbins, at
+@samp{arnold@@skeeve.atl.ga.us}. David is most likely to fix code
+problems, while Arnold is most likely to fix documentation problems.@refill
+
+Non-bug suggestions are always welcome as well. If you have questions
+about things that are unclear in the documentation or are just obscure
+features, ask Arnold Robbins; he'll be happy to help you out (but no
+promises). You can send him electronic mail at the Internet address
+above.
+
+@node Notes, Glossary, Bugs, Top
@appendix Implementation Notes
This appendix contains information mainly of interest to implementors and
@@ -8052,42 +10029,31 @@ maintainers of @code{gawk}. Everything in it applies specifically to
@node Compatibility Mode, Future Extensions, Notes, Notes
@appendixsec Downwards Compatibility and Debugging
-@xref{S5R4/GNU}, for a summary of the GNU extensions to the @code{awk}
-language and program. All of these features can be turned off either by
-compiling @code{gawk} with @samp{-DSTRICT} (not recommended), or by
-invoking @code{gawk} with the @samp{-c} option.@refill
+@xref{POSIX/GNU}, for a summary of the GNU extensions to the @code{awk}
+language and program. All of these features can be turned off by
+invoking @code{gawk} with the @samp{-W compat} option, or with the
+@samp{-W posix} option.@refill
If @code{gawk} is compiled for debugging with @samp{-DDEBUG}, then there
-are two more options available on the command line.
+is one more option available on the command line:
@table @samp
-@item -d
-Print out debugging information during execution.
-
-@item -D
+@item -W debug
Print out the parse stack information as the program is being parsed.
@end table
-Both of these options are intended only for serious @code{gawk} developers,
-and not for the casual user. They probably have not even been compiled into
-your version of @code{gawk}, since they slow down execution.
-
-The code for recognizing special file names such as @file{/dev/stdin}
-can be disabled at compile time with @samp{-DNO_DEV_FD}, or with
-@samp{-DSTRICT}.@refill
+This option is intended only for serious @code{gawk} developers,
+and not for the casual user. It probably has not even been compiled into
+your version of @code{gawk}, since it slows down execution.
@node Future Extensions, Improvements, Compatibility Mode, Notes
@appendixsec Probable Future Extensions
This section briefly lists extensions that indicate the directions we are
-currently considering for @code{gawk}.
+currently considering for @code{gawk}. The file @file{FUTURES} in the
+@code{gawk} distribution lists these extensions, as well as several others.
@table @asis
-@item ANSI C compatible @code{printf}
-The @code{printf} and @code{sprintf} functions may be enhanced to be
-fully compatible with the specification for the @code{printf} family
-of functions in ANSI C.@refill
-
@item @code{RS} as a regexp
The meaning of @code{RS} may be generalized along the lines of @code{FS}.
@@ -8096,7 +10062,7 @@ Changes made in @code{gawk} to the array @code{ENVIRON} may be
propagated to subprocesses run by @code{gawk}.
@item Data bases
-It may be possible to map an NDBM/GDBM file into an @code{awk} array.
+It may be possible to map a GDBM/NDBM/SDBM file into an @code{awk} array.
@item Single-character fields
The null string, @code{""}, as a field separator, will cause field
@@ -8104,14 +10070,29 @@ splitting and the split function to separate individual characters.
Thus, @code{split("abcd", a, "")} would yield @code{a[1] == "a"},
@code{a[2] == "b"}, and so on.
-@item Fixed-length fields and records
-A mechanism may be provided to allow the specification of fixed length
-fields and records.
+@item More @code{lint} warnings
+There are more things that could be checked for portability.
+
+@item @code{RECLEN} variable for fixed length records
+Along with @code{FIELDWIDTHS}, this would speed up the processing of
+fixed-length records.
+
+@item A @code{restart} keyword
+After modifying @code{$0}, @code{restart} would restart the pattern
+matching loop, without reading a new record from the input.
+
+@item A @code{nextfile} keyword
+This would be like @code{next}, but instead of abandoning the current
+input record, it would abandon the entire current input file.
-@item Regexp syntax
-The @code{egrep} syntax for regular expressions, now specified
-with the @samp{-e} option, may become the default, since the
-POSIX standard may specify this.
+@item A @samp{|&} redirection
+The @samp{|&} redirection, in place of @samp{|}, would open a two-way
+pipeline for communication with a sub-process (via @code{getline} and
+@code{print} and @code{printf}).
+
+@item @code{IGNORECASE} affecting all comparisons
+The effects of the @code{IGNORECASE} variable may be generalized to
+all string comparisons, and not just regular expression operations.
@c this is @emph{very} long term --- not worth including right now.
@ignore
@@ -8134,21 +10115,11 @@ project.@refill
@enumerate
@item
-State machine regexp matcher: At present, @code{gawk} uses the
-backtracking regular expression matcher from the GNU subroutine library.
-If a regexp is really going to be used a lot of times, it is faster to
-convert it once to a description of a finite state machine, then run a
-routine simulating that machine every time you want to match the regexp.
-You might be able to use the matching routines used by GNU @code{egrep}.
-
-@item
Compilation of @code{awk} programs: @code{gawk} uses a Bison (YACC-like)
parser to convert the script given it into a syntax tree; the syntax
tree is then executed by a simple recursive evaluator. Both of these
-steps incur a lot of overhead, since parsing can be slow (especially if
-you also do the previous project and convert regular expressions to
-finite state machines at compile time) and the recursive evaluator
-performs many procedure calls to do even the simplest things.@refill
+steps incur a lot of overhead, since parsing can be slow and the recursive
+evaluator performs many procedure calls to do even the simplest things.@refill
It should be possible for @code{gawk} to convert the script's parse tree
into a C program which the user would then compile, using the normal
@@ -8163,10 +10134,23 @@ a straight line byte code interpreter that would be intermediate in speed
between running a compiled program and doing what @code{gawk} does
now.@refill
+This may actually happen for the 3.0 version of @code{gawk}.
+
@item
An error message section has not been included in this version of the
manual. Perhaps some nice beta testers will document some of the messages
for the future.
+
+@item
+The programs in the test suite could use documenting in this manual.
+
+@item
+The programs and data files in the manual should be available in
+separate files to facilitate experimentation.
+
+@item
+See the @file{FUTURES} file for more ideas. Contact us if you would
+seriously like to tackle any of the items listed there.
@end enumerate
@node Glossary, Index , Notes, Top
@@ -8183,8 +10167,12 @@ rule's action. Actions are always enclosed in curly braces.
Henry Spencer at the University of Toronto wrote a retargetable assembler
completely as @code{awk} scripts. It is thousands of lines long, including
machine descriptions for several 8-bit microcomputers. It is distributed
-with @code{gawk} and is a good example of a program that would have been
-better written in another language.@refill
+with @code{gawk} (as part of the test suite) and is a good example of a
+program that would have been better written in another language.@refill
+
+@item @sc{ANSI}
+The American National Standards Institute. This organization produces
+many standards, among them the standard for the C programming language.
@item Assignment
An @code{awk} expression that changes the value of some @code{awk}
@@ -8205,22 +10193,31 @@ Another name for an @code{awk} program.
@item Built-in Function
The @code{awk} language provides built-in functions that perform various
-numerical and string computations. Examples are @code{sqrt} (for the
-square root of a number) and @code{substr} (for a substring of a
-string). @xref{Built-in}.@refill
+numerical, time stamp related, and string computations. Examples are
+@code{sqrt} (for the square root of a number) and @code{substr} (for a
+substring of a string). @xref{Built-in}.@refill
@item Built-in Variable
-The variables @code{ARGC}, @code{ARGV}, @code{ENVIRON}, @code{FILENAME},
-@code{FNR}, @code{FS}, @code{NF}, @code{IGNORECASE}, @code{NR}, @code{OFMT},
-@code{OFS}, @code{ORS}, @code{RLENGTH}, @code{RSTART}, @code{RS}, and
-@code{SUBSEP}, have special meaning to @code{awk}. Changing some of them
-affects @code{awk}'s running environment. @xref{Built-in Variables}.@refill
+The variables @code{ARGC}, @code{ARGV}, @code{CONVFMT}, @code{FIELDWIDTHS},
+@code{ENVIRON}, @code{FILENAME}, @code{FNR}, @code{FS}, @code{IGNORECASE},
+@code{NF}, @code{NR}, @code{OFMT}, @code{OFS}, @code{ORS},
+@code{RLENGTH}, @code{RSTART}, @code{RS}, and @code{SUBSEP}, have special
+meaning to @code{awk}. Changing some of them affects @code{awk}'s running
+environment. @xref{Built-in Variables}.@refill
+
+@item Braces
+See ``Curly Braces.''
@item C
The system programming language that most GNU software is written in. The
@code{awk} programming language has C-like syntax, and this manual
points out similarities between @code{awk} and C when appropriate.@refill
+@item CHEM
+A preprocessor for @code{pic} that reads descriptions of molecules
+and produces @code{pic} input for drawing them. It was written by
+Brian Kernighan, and is distributed with the @code{gawk} test suite.@refill
+
@item Compound Statement
A series of @code{awk} statements, enclosed in curly braces. Compound
statements may be nested. @xref{Statements}.@refill
@@ -8247,8 +10244,8 @@ its execution. @xref{Regexp Usage}.
@item Comparison Expression
A relation that is either true or false, such as @code{(a < b)}.
-Comparison expressions are used in @code{if} and @code{while} statements,
-and in patterns to select which input records to process.
+Comparison expressions are used in @code{if}, @code{while}, and @code{for}
+statements, and in patterns to select which input records to process.
@xref{Comparison Ops}.@refill
@item Curly Braces
@@ -8275,13 +10272,14 @@ ESC (escape) character. @xref{Constants}.
When @code{awk} reads an input record, it splits the record into pieces
separated by whitespace (or by a separator regexp which you can
change by setting the built-in variable @code{FS}). Such pieces are
-called fields. @xref{Records}.@refill
+called fields. If the pieces are of fixed length, you can use the built-in
+variable @code{FIELDWIDTHS} to describe their lengths. @xref{Records}.@refill
@item Format
Format strings are used to control the appearance of output in the
@code{printf} statement. Also, data conversions from numbers to strings
are controlled by the format string contained in the built-in variable
-@code{OFMT}. @xref{Control Letters}; also @pxref{Output Separators}.@refill
+@code{CONVFMT}. @xref{Control Letters}.@refill
@item Function
A specialized group of statements often used to encapsulate general
@@ -8300,7 +10298,7 @@ record consists of one line of text. @xref{Records}.@refill
In the @code{awk} language, a keyword is a word that has special
meaning. Keywords are reserved and may not be used as variable names.
-The keywords of @code{awk} are:
+@code{awk}'s keywords are:
@code{if},
@code{else},
@code{while},
@@ -8334,6 +10332,12 @@ tested. If the condition is satisfied, the pattern is said to @dfn{match}
the input record. A typical pattern might compare the input record against
a regular expression. @xref{Patterns}.@refill
+@item @sc{POSIX}
+The name for a series of standards being developed by the @sc{IEEE}
+that specify a Portable Operating System interface. The ``IX'' denotes
+the Unix heritage of these standards. The main standard of interest for
+@code{awk} users is P1003.2, the Command Language and Utilities standard.
+
@item Range (of input lines)
A sequence of consecutive lines from the input file. A pattern
can specify ranges of input lines for @code{awk} to process, or it can
@@ -8341,7 +10345,7 @@ specify single lines. @xref{Patterns}.@refill
@item Recursion
When a function calls itself, either directly or indirectly.
-If this isn't clear, refer to the entry for ``recursion''.
+If this isn't clear, refer to the entry for ``recursion.''
@item Redirection
Redirection means performing input from other than the standard input
@@ -8353,7 +10357,7 @@ operators. You can redirect input to the @code{getline} statement using
the @samp{<} and @samp{|} operators. @xref{Redirection}.@refill
@item Regular Expression
-See ``regexp''.
+See ``regexp.''
@item Regexp
Short for @dfn{regular expression}. A regexp is a pattern that denotes a
@@ -8390,7 +10394,7 @@ user.@refill
@item String
A datum consisting of a sequence of characters, such as @samp{I am a
string}. Constant strings are written with double-quotes in the
-@code{awk} language, and may contain @dfn{escape sequences}.
+@code{awk} language, and may contain escape sequences.
@xref{Constants}.
@item Whitespace
@@ -8405,3 +10409,19 @@ string.@refill
@summarycontents
@contents
@bye
+
+Unresolved Issues:
+------------------
+1. From: ntomczak@vm.ucs.ualberta.ca
+ Examples of usage tend to suggest that /../ and ".." delimiters
+ can be used for regular expressions, even if definition is consistently
+ using /../. I am not sure what the real rules are and in particular
+ what of the following is a bug and what is a feature:
+ # This program matches everything
+ '"\(" { print }'
+ # This one complains about mismatched parenthesis
+ '$0 ~ "\(" { print }'
+ # This one behaves in an expected manner
+ '/\(/ { print }'
+ You may also try to use "\(" as an argument to match() to see what
+ will happen.
diff --git a/io.c b/io.c
index 02852f15..163dc19b 100644
--- a/io.c
+++ b/io.c
@@ -10,8 +10,8 @@
*
* GAWK is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 1, or (at your option)
- * any later version.
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
*
* GAWK is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -20,7 +20,7 @@
*
* You should have received a copy of the GNU General Public License
* along with GAWK; see the file COPYING. If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include "awk.h"
@@ -36,7 +36,7 @@
#endif
static IOBUF *nextfile P((void));
-static int inrec P((IOBUF *iop, int getline_redirect));
+static int inrec P((IOBUF *iop));
static int iop_close P((IOBUF *iop));
struct redirect *redirect P((NODE *tree, int *errflg));
static void close_one P((void));
@@ -63,9 +63,14 @@ nextfile()
static int files = 0;
char *arg;
int fd = INVALID_HANDLE;
+ static IOBUF *curfile = NULL;
- if (curfile != NULL && curfile->cnt != EOF)
- return curfile;
+ if (curfile != NULL) {
+ if (curfile->cnt == EOF)
+ (void) iop_close(curfile);
+ else
+ return curfile;
+ }
for (; i < (int) (ARGC_node->lnode->numbr); i++) {
arg = (*assoc_lookup(ARGV_node, tmp_number((AWKNUM) i)))->stptr;
if (*arg == '\0')
@@ -94,7 +99,7 @@ nextfile()
fd = 0;
}
if (fd == INVALID_HANDLE)
- return NULL;
+ return curfile = NULL;
return curfile = iop_alloc(fd);
}
@@ -114,9 +119,8 @@ set_NR()
* This reads in a record from the input file
*/
static int
-inrec(iop, getline_redirect)
+inrec(iop)
IOBUF *iop;
-int getline_redirect;
{
char *begin;
register int cnt;
@@ -126,7 +130,7 @@ int getline_redirect;
if (cnt == EOF) {
cnt = 0;
retval = 1;
- } else if (!getline_redirect) {
+ } else {
NR += 1;
FNR += 1;
}
@@ -145,17 +149,17 @@ IOBUF *iop;
return 0;
errno = 0;
- /* Work around bug in UNICOS popen, but it shouldn't hurt elsewhere */
+#ifdef _CRAY
+ /* Work around bug in UNICOS popen */
if (iop->fd < 3)
ret = 0;
else
- ret = close(iop->fd);
+#endif
+ ret = close(iop->fd);
if (ret == -1)
warning("close of fd %d failed (%s)", iop->fd, strerror(errno));
- free(iop->buf);
- free(iop->secbuf);
- if (iop == curfile)
- curfile = NULL; /* kludge -- gotta do better */
+ if (iop->buf)
+ free(iop->buf);
free((char *)iop);
return ret == -1 ? 1 : 0;
}
@@ -167,11 +171,9 @@ do_input()
extern int exiting;
while ((iop = nextfile()) != NULL) {
- if (inrec(iop, 0) == 0)
- while (interpret(expression_value) && inrec(iop, 0) == 0)
+ if (inrec(iop) == 0)
+ while (interpret(expression_value) && inrec(iop) == 0)
;
- (void) iop_close(iop);
- iop = NULL;
if (exiting)
break;
}
@@ -492,11 +494,11 @@ char *name, *mode;
cp = name + 5;
/* XXX - first three tests ignore mode */
- if (STREQ(cp, "stdin") && (flag & O_RDONLY))
+ if (STREQ(cp, "stdin") && (flag & O_RDONLY) == O_RDONLY)
openfd = fileno(stdin);
- else if (STREQ(cp, "stdout") && (flag & O_WRONLY))
+ else if (STREQ(cp, "stdout") && (flag & O_WRONLY) == O_WRONLY)
openfd = fileno(stdout);
- else if (STREQ(cp, "stderr") && (flag & O_WRONLY))
+ else if (STREQ(cp, "stderr") && (flag & O_WRONLY) == O_WRONLY)
openfd = fileno(stderr);
else if (STREQN(cp, "fd/", 3)) {
cp += 3;
@@ -700,47 +702,48 @@ NODE *tree;
{
struct redirect *rp = NULL;
IOBUF *iop;
- int cnt;
- NODE **lhs;
- int redir_error = 0;
- int getline_redirect = 0;
-
- if (tree->rnode == NULL) { /* no redirection */
- iop = nextfile();
- if (iop == NULL) /* end of input */
- return tmp_number((AWKNUM) 0.0);
- } else {
- rp = redirect(tree->rnode, &redir_error);
- if (rp == NULL && redir_error) /* failed redirect */
- return tmp_number((AWKNUM) -1.0);
- iop = rp->iop;
- getline_redirect++;
- }
- if (tree->lnode == NULL) { /* no optional var. -- read in $0 */
- if (inrec(iop, getline_redirect) != 0)
- return tmp_number((AWKNUM) 0.0);
- } else { /* read in a named variable */
- char *s = NULL;
- Func_ptr after_assign = NULL;
-
- lhs = get_lhs(tree->lnode, &after_assign);
- cnt = get_a_record(&s, iop, *RS);
- if (!getline_redirect) {
- NR += 1;
- FNR += 1;
+ int cnt = EOF;
+ char *s = NULL;
+
+ while (cnt == EOF) {
+ if (tree->rnode == NULL) { /* no redirection */
+ iop = nextfile();
+ if (iop == NULL) /* end of input */
+ return tmp_number((AWKNUM) 0.0);
+ } else {
+ int redir_error = 0;
+
+ rp = redirect(tree->rnode, &redir_error);
+ if (rp == NULL && redir_error) /* failed redirect */
+ return tmp_number((AWKNUM) -1.0);
+ iop = rp->iop;
}
+ cnt = get_a_record(&s, iop, *RS);
if (cnt == EOF) {
if (rp) {
(void) iop_close(iop);
rp->iop = NULL;
- }
- return tmp_number((AWKNUM) 0.0);
+ return tmp_number((AWKNUM) 0.0);
+ } else
+ continue; /* try another file */
+ }
+ if (!rp) {
+ NR += 1;
+ FNR += 1;
+ }
+ if (tree->lnode == NULL) /* no optional var. */
+ set_record(s, cnt, 1);
+ else { /* assignment to variable */
+ Func_ptr after_assign = NULL;
+ NODE **lhs;
+
+ lhs = get_lhs(tree->lnode, &after_assign);
+ unref(*lhs);
+ *lhs = make_string(s, strlen(s));
+ /* we may have to regenerate $0 here! */
+ if (after_assign)
+ (*after_assign)();
}
- unref(*lhs);
- *lhs = make_string(s, strlen(s));
- /* we may have to regenerate $0 here! */
- if (after_assign)
- (*after_assign)();
}
return tmp_number((AWKNUM) 1.0);
}
diff --git a/iop.c b/iop.c
index dae43f42..38bd29aa 100644
--- a/iop.c
+++ b/iop.c
@@ -10,8 +10,8 @@
*
* GAWK is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 1, or (at your option)
- * any later version.
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
*
* GAWK is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -20,7 +20,7 @@
*
* You should have received a copy of the GNU General Public License
* along with GAWK; see the file COPYING. If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include "awk.h"
@@ -36,6 +36,14 @@
#ifdef TEST
int bufsize = 8192;
+
+void
+fatal(s)
+char *s;
+{
+ printf("%s\n", s);
+ exit(1);
+}
#endif
int
@@ -62,7 +70,7 @@ int fd;
#ifdef TEST
return bufsize;
-#endif
+#else
#ifndef atarist
if (isatty(fd))
#else
@@ -78,6 +86,7 @@ int fd;
if (lseek(fd, 0L, 0) == -1)
return DEFBLKSIZE;
return (stb.st_size < DEFBLKSIZE ? stb.st_size : DEFBLKSIZE);
+#endif /*! TEST */
#endif /*! VMS */
}
@@ -94,125 +103,164 @@ int fd;
if (isatty(fd))
iop->flag |= IOP_IS_TTY;
iop->size = optimal_bufsize(fd);
+ iop->secsiz = -2;
errno = 0;
iop->fd = fd;
- emalloc(iop->buf, char *, iop->size + 2, "iop_alloc");
- iop->end = iop->off = iop->buf;
- iop->secsiz = iop->size < BUFSIZ ? iop->size : BUFSIZ;
- emalloc(iop->secbuf, char *, iop->secsiz+2, "iop_alloc");
- iop->cnt = -1;
+ iop->off = iop->buf = NULL;
+ iop->cnt = 0;
return iop;
}
+/*
+ * Get the next record. Uses a "split buffer" where the latter part is
+ * the normal read buffer and the head part is an "overflow" area that is used
+ * when a record spans the end of the normal buffer, in which case the first
+ * part of the record is copied into the overflow area just before the
+ * normal buffer. Thus, the eventual full record can be returned as a
+ * contiguous area of memory with a minimum of copying. The overflow area
+ * is expanded as needed, so that records are unlimited in length.
+ * We also mark both the end of the buffer and the end of the read() with
+ * a sentinel character (the current record separator) so that the inside
+ * loop can run as a single test.
+ */
int
-get_a_record(out, iop, rs)
+get_a_record(out, iop, RS)
char **out;
IOBUF *iop;
-register int rs;
+register int RS;
{
register char *bp = iop->off;
- register char *end_data = iop->end; /* end of current data read */
- char *end_buf = iop->buf + iop->size; /* end of input buffer */
+ char *bufend;
char *start = iop->off; /* beginning of record */
- char *offset = iop->secbuf; /* end of data in secbuf */
- size_t size;
+#ifdef atarist
+#define P_DIFF ptrdiff_t
+#else
+#define P_DIFF size_t
+#endif
+ P_DIFF len;
+ int saw_newline;
+ char rs;
+ int eat_whitespace;
- if (iop->cnt == 0)
+ if (iop->cnt == EOF) /* previous read hit EOF */
return EOF;
- /* set up sentinels */
- if (rs == 0) {
- *end_data = *(end_data+1) = '\n';
- *end_buf = *(end_buf+1) = '\n';
+ if (RS == 0) { /* special case: RS == "" */
+ rs = '\n';
+ eat_whitespace = 0;
+ saw_newline = 0;
+ } else
+ rs = RS;
+
+ /* set up sentinel */
+ if (iop->buf) {
+ bufend = iop->buf + iop->size + iop->secsiz;
+ *bufend = rs;
} else
- *end_data = *end_buf = rs;
+ bufend = NULL;
for (;;) { /* break on end of record, read error or EOF */
- if (bp == end_data) {
- if (bp == end_buf) { /* record spans buffer end */
-#ifdef atarist
-#define P_DIFF ptrdiff_t
-#else
-#define P_DIFF int
-#endif
-#define COPY_TO_SECBUF { \
- P_DIFF oldlen = offset - iop->secbuf; \
- P_DIFF newlen = bp - start; \
- \
- if (iop->secsiz < oldlen + newlen) { \
- erealloc(iop->secbuf, char *, \
- oldlen+newlen, "get_record"); \
- offset = iop->secbuf + oldlen; \
- } \
- memcpy(offset, start, newlen); \
- offset += newlen; \
+ /* Following code is entered on the first call of this routine
+ * for a new iop, or when we scan to the end of the buffer.
+ * In the latter case, we copy the current partial record to
+ * the space preceding the normal read buffer. If necessary,
+ * we expand this space. This is done so that we can return
+ * the record as a contiguous area of memory.
+ */
+ if (bp >= bufend) {
+ char *oldbuf = NULL;
+ char *oldsplit = iop->buf + iop->secsiz;
+
+ len = bp - start;
+ if (len > iop->secsiz) {
+ if (iop->secsiz == -2)
+ iop->secsiz = 256;
+ while (len > iop->secsiz)
+ iop->secsiz *= 2;
+ oldbuf = iop->buf;
+ emalloc(iop->buf, char *,
+ iop->size+iop->secsiz+2, "get_a_record");
+ bufend = iop->buf + iop->size + iop->secsiz;
+ *bufend = rs;
}
- COPY_TO_SECBUF
- start = bp = iop->buf;
- size = iop->size;
- } else
- size = end_buf - bp;
- iop->cnt = read(iop->fd, bp, size);
+ if (len) {
+ char *newsplit = iop->buf + iop->secsiz;
+
+ if (start < oldsplit) {
+ memcpy(newsplit - len, start, oldsplit - start);
+ memcpy(newsplit - (bp - oldsplit), oldsplit, bp - oldsplit);
+ } else
+ memcpy(newsplit - len, start, len);
+ }
+ bp = iop->end = iop->off = iop->buf + iop->secsiz;
+ start = bp - len;
+ if (oldbuf) {
+ free(oldbuf);
+ oldbuf = NULL;
+ }
+ }
+ /* Following code is entered whenever we have no more data to
+ * scan. In most cases this will read into the beginning of
+ * the main buffer, but in some cases (terminal, pipe etc.)
+ * we may be doing smallish reads into more advanced positions.
+ */
+ if (bp >= iop->end) {
+ iop->cnt = read(iop->fd, iop->end, bufend - iop->end);
if (iop->cnt == -1)
fatal("error reading input");
else if (iop->cnt == 0) {
+ iop->cnt = EOF;
break;
- } else {
- end_data = bp + iop->cnt;
- if (rs == 0 && *bp == '\n'
- && offset > iop->secbuf
- && *(offset-1) == '\n') {
- bp++;
- break;
- }
- if (rs == 0) {
- *end_data = *(end_data+1) = '\n';
- *end_buf = *(end_buf+1) = '\n';
- } else
- *end_data = rs;
}
+ iop->end += iop->cnt;
+ *iop->end = rs;
}
- if (rs == 0) {
- for (;;) {
- if (*bp++ == '\n' && *bp == '\n') {
+ if (RS == 0) {
+ extern int default_FS;
+
+ if (default_FS && (bp == start || eat_whitespace)) {
+ while (bp < iop->end && isspace(*bp))
bp++;
- break;
- }
+ if (bp == iop->end) {
+ eat_whitespace = 1;
+ continue;
+ } else
+ eat_whitespace = 0;
+ }
+ if (saw_newline && *bp == rs) {
+ bp++;
+ break;
}
+ saw_newline = 0;
+ }
+
+ while (*bp++ != rs)
+ ;
+
+ if (bp <= iop->end) {
+ if (RS == 0)
+ saw_newline = 1;
+ else
+ break;
} else
- while (*bp++ != rs)
- ;
- if (bp <= end_data) /* end of record */
- break;
- bp = end_data;
+ bp--;
}
- if (offset == iop->secbuf && start == bp && iop->cnt == 0) {
- *out = start;
+ if (iop->cnt == EOF && start == bp)
return EOF;
- }
+
iop->off = bp;
- iop->end = end_data;
- if (offset != iop->secbuf) {
- if (start != bp)
- COPY_TO_SECBUF
- start = iop->secbuf;
- bp = offset;
- }
- if (rs == 0) {
- if (*--bp == '\n') {
- *bp = '\0';
- if (*--bp == '\n')
- *bp = '\0';
- else
- bp++;
- } else
- bp++;
- } else if (*--bp == rs)
- ;
+ if (*--bp == rs)
+ *bp = '\0';
else
bp++;
- *bp = '\0';
+ if (RS == 0) {
+ if (*--bp == rs)
+ *bp = '\0';
+ else
+ bp++;
+ }
+
*out = start;
return bp - start;
}
@@ -225,13 +273,17 @@ char *argv[];
IOBUF *iop;
char *out;
int cnt;
+ char rs[2];
+ rs[0] = 0;
if (argc > 1)
bufsize = atoi(argv[1]);
+ if (argc > 2)
+ rs[0] = *argv[2];
iop = iop_alloc(0);
- while ((cnt = get_a_record(&out, iop, 0)) > 0) {
+ while ((cnt = get_a_record(&out, iop, rs[0])) > 0) {
fwrite(out, 1, cnt, stdout);
- fwrite("\n", 1, 1, stdout);
+ fwrite(rs, 1, 1, stdout);
}
}
#endif
diff --git a/main.c b/main.c
index 22d583dc..b0ca9e6d 100644
--- a/main.c
+++ b/main.c
@@ -10,8 +10,8 @@
*
* GAWK is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 1, or (at your option)
- * any later version.
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
*
* GAWK is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -20,7 +20,7 @@
*
* You should have received a copy of the GNU General Public License
* along with GAWK; see the file COPYING. If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include "awk.h"
@@ -118,7 +118,6 @@ char **argv;
extern char *optarg;
int i;
int do_nostalgia;
- int regex_mode = RE_SYNTAX_AWK;
(void) signal(SIGFPE, (SIGTYPE (*) P((int))) catchsig);
(void) signal(SIGSEGV, (SIGTYPE (*) P((int))) catchsig);
@@ -177,6 +176,10 @@ char **argv;
nostalgia();
/* NOTREACHED */
}
+ /* Tell the regex routines how they should work. . . */
+ (void) re_set_syntax(RE_SYNTAX_AWK);
+ regsyntax(RE_SYNTAX_AWK, 0);
+
while ((c = getopt (argc, argv, awk_opts)) != EOF) {
switch (c) {
@@ -225,12 +228,10 @@ char **argv;
case 'a': /* use old fashioned awk regexps */
warning("option -a will go away in the next release");
- /*regex_mode = RE_SYNTAX_AWK;*/
break;
case 'e': /* use Posix style regexps */
warning("option -e will go away in the next release");
- /*regex_mode = RE_SYNTAX_POSIX_AWK;*/
break;
case 'W': /* gawk specific options */
@@ -245,10 +246,6 @@ char **argv;
}
}
- /* Tell the regex routines how they should work. . . */
- (void) re_set_syntax(regex_mode);
- regsyntax(regex_mode, 0);
-
#ifdef DEBUG
setbuf(stdout, (char *) NULL); /* make debugging easier */
#endif
@@ -357,14 +354,21 @@ static void
copyleft ()
{
static char blurb[] =
-"Copyright (C) 1989, Free Software Foundation.\n\
-GNU Awk comes with ABSOLUTELY NO WARRANTY. This is free software, and\n\
-you are welcome to distribute it under the terms of the GNU General\n\
-Public License, which covers both the warranty information and the\n\
-terms for redistribution.\n\n\
-You should have received a copy of the GNU General Public License along\n\
-with this program; if not, write to the Free Software Foundation, Inc.,\n\
-675 Mass Ave, Cambridge, MA 02139, USA.\n";
+"Copyright (C) 1989, 1991, Free Software Foundation.\n\
+\n\
+This program is free software; you can redistribute it and/or modify\n\
+it under the terms of the GNU General Public License as published by\n\
+the Free Software Foundation; either version 2 of the License, or\n\
+(at your option) any later version.\n\
+\n\
+This program is distributed in the hope that it will be useful,\n\
+but WITHOUT ANY WARRANTY; without even the implied warranty of\n\
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n\
+GNU General Public License for more details.\n\
+\n\
+You should have received a copy of the GNU General Public License\n\
+along with this program; if not, write to the Free Software\n\
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.\n";
fprintf(stderr, "%s, patchlevel %d\n", version_string, PATCHLEVEL);
fputs(blurb, stderr);
@@ -384,9 +388,15 @@ char *str;
* Only if in full compatibility mode check for the stupid special
* case so -F\t works as documented in awk even though the shell
* hands us -Ft. Bleah!
+ *
+ * Thankfully, Posix didn't propagate this "feature".
*/
- if (strict && str[0] == 't' && str[1] == '\0')
- str[0] = '\t';
+ if (str[0] == 't' && str[1] == '\0') {
+ if (do_lint)
+ warning("-Ft does not set FS to tab in POSIX awk");
+ if (strict && ! do_posix)
+ str[0] = '\t';
+ }
*tmp = make_str_node(str, len, SCAN); /* do process escapes */
set_FS();
}
diff --git a/missing/dup2.c b/missing/dup2.c
deleted file mode 100644
index 01068348..00000000
--- a/missing/dup2.c
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef F_DUPFD
-#include <fcntl.h>
-#endif
-
-int
-dup2 (old, new)
-int old, new;
-{
- (void) close(new);
-
- return fcntl(old, F_DUPFD, new);
-}
diff --git a/missing/gcvt.c b/missing/gcvt.c
deleted file mode 100644
index 1ebe41e1..00000000
--- a/missing/gcvt.c
+++ /dev/null
@@ -1,9 +0,0 @@
-char *
-gcvt(value, digits, buff)
-double value;
-int digits;
-char *buff;
-{
- sprintf(buff, "%*g", digits, value);
- return (buff);
-}
diff --git a/missing/getopt.c b/missing/getopt.c
index d9e52945..09a1b233 100644
--- a/missing/getopt.c
+++ b/missing/getopt.c
@@ -14,7 +14,7 @@
* manual page. The difference apparently involved a note about the
* famous rules 5 and 6, recommending using white space between an option
* and its first argument, and not grouping options that have arguments.
- * Getopt itself is currently lenient about both of these things White
+ * Getopt itself is currently lenient about both of these things. White
* space is allowed, but not mandatory, and the last option in a group can
* have an argument. That particular version of the man page evidently
* has no official existence, and my source at AT&T did not send a copy.
@@ -22,12 +22,14 @@
* However, I am not about to post a copy of anything licensed by AT&T.
*/
-#if defined(MSDOS) || defined(USG)
+#if defined(__STDC__) || defined(USG) || defined(MSDOS) || defined(VMS)
#define index strchr
#endif
/*LINTLIBRARY*/
+#ifndef NULL
#define NULL 0
+#endif
#define EOF (-1)
#define ERR(s, c) if(opterr){\
extern int write();\
diff --git a/missing/memcmp.c b/missing/memcmp.c
index e39c10ec..63cb5f8f 100644
--- a/missing/memcmp.c
+++ b/missing/memcmp.c
@@ -10,9 +10,9 @@ memcmp (s1, s2, l)
register char *s1, *s2;
register int l;
{
- for (; l--; s1++, s2++) {
+ for (; l-- > 0; s1++, s2++) {
if (*s1 != *s2)
return (*s1 - *s2);
}
- return (*--s1 - *--s2);
+ return (0);
}
diff --git a/missing/random.c b/missing/random.c
index 3708fe90..3cd675e4 100644
--- a/missing/random.c
+++ b/missing/random.c
@@ -19,7 +19,9 @@
static char sccsid[] = "@(#)random.c 5.5 (Berkeley) 7/6/88";
#endif /* LIBC_SCCS and not lint */
+#if 0
#include <stdio.h>
+#endif
/*
* random.c:
@@ -87,6 +89,10 @@ static char sccsid[] = "@(#)random.c 5.5 (Berkeley) 7/6/88";
#define BREAK_3 128
#define DEG_3 31
#define SEP_3 3
+#ifdef _CRAY
+#define DEG_3_P1 32 /* bug - do addition here */
+#define SEP_3_P1 4 /* *_3 + 1 = _3_P1 */
+#endif
#define TYPE_4 4 /* x**63 + x + 1 */
#define BREAK_4 256
@@ -142,7 +148,11 @@ static long randtbl[ DEG_3 + 1 ] = { TYPE_3,
* to point to randtbl[1] (as explained below).
*/
+#ifdef _CRAY
+static long *fptr = &randtbl[ SEP_3_P1 ];
+#else
static long *fptr = &randtbl[ SEP_3 + 1 ];
+#endif
static long *rptr = &randtbl[ 1 ];
@@ -165,7 +175,11 @@ static int rand_type = TYPE_3;
static int rand_deg = DEG_3;
static int rand_sep = SEP_3;
+#ifdef _CRAY
+static long *end_ptr = &randtbl[ DEG_3_P1 ];
+#else
static long *end_ptr = &randtbl[ DEG_3 + 1 ];
+#endif
@@ -236,7 +250,7 @@ initstate( seed, arg_state, n )
if( n < BREAK_1 ) {
if( n < BREAK_0 ) {
fprintf( stderr, "initstate: not enough state (%d bytes) with which to do jack; ignored.\n", n );
- return 0;
+ return;
}
rand_type = TYPE_0;
rand_deg = DEG_0;
diff --git a/missing/strcase.c b/missing/strcase.c
index d8fa674a..6834f27d 100644
--- a/missing/strcase.c
+++ b/missing/strcase.c
@@ -19,11 +19,7 @@
static char sccsid[] = "@(#)strcasecmp.c 5.6 (Berkeley) 6/27/88";
#endif /* LIBC_SCCS and not lint */
-#ifndef USG
-#include <sys/types.h>
-#else
#define u_char unsigned char
-#endif
/*
* This array is designed for mapping upper and lower case letter
@@ -55,10 +51,10 @@ static u_char charmap[] = {
'\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
'\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
'\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
- '\300', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
+ '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
'\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
- '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
- '\370', '\371', '\372', '\333', '\334', '\335', '\336', '\337',
+ '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\327',
+ '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\337',
'\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
'\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
'\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
diff --git a/missing/strerror.c b/missing/strerror.c
index e9c20804..badaf5dd 100644
--- a/missing/strerror.c
+++ b/missing/strerror.c
@@ -3,15 +3,15 @@
*/
/*
- * Copyright (C) 1986, 1988, 1989 the Free Software Foundation, Inc.
+ * Copyright (C) 1986, 1988, 1989, 1991 the Free Software Foundation, Inc.
*
* This file is part of GAWK, the GNU implementation of the
* AWK Progamming Language.
*
* GAWK is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 1, or (at your option)
- * any later version.
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
*
* GAWK is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -20,17 +20,12 @@
*
* You should have received a copy of the GNU General Public License
* along with GAWK; see the file COPYING. If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
extern int sys_nerr;
extern char *sys_errlist[];
-/* have to get right decl of sprintf early on */
-#ifndef BUFSIZ /* stdio specific definition */
-#include <stdio.h>
-#endif
-
char *
strerror(n)
int n;
diff --git a/missing/strftime.3 b/missing/strftime.3
new file mode 100644
index 00000000..b61ed029
--- /dev/null
+++ b/missing/strftime.3
@@ -0,0 +1,259 @@
+.TH STRFTIME 3
+.SH NAME
+strftime \- generate formatted time information
+.SH SYNOPSIS
+.ft B
+.nf
+#include <sys/types.h>
+#include <time.h>
+.sp
+size_t strftime(char *s, size_t maxsize, const char *format,
+ const struct tm *timeptr);
+.SH DESCRIPTION
+The following description is transcribed verbatim from the December 7, 1988
+draft standard for ANSI C.
+This draft is essentially identical in technical content
+to the final version of the standard.
+.LP
+The
+.B strftime
+function places characters into the array pointed to by
+.B s
+as controlled by the string pointed to by
+.BR format .
+The format shall be a multibyte character sequence, beginning and ending in
+its initial shift state.
+The
+.B format
+string consists of zero or more conversion specifiers and ordinary
+multibyte characters. A conversion specifier consists of a
+.B %
+character followed by a character that determines the behavior of the
+conversion specifier.
+All ordinary multibyte characters (including the terminating null
+character) are copied unchanged into the array.
+If copying takes place between objects that overlap the behavior is undefined.
+No more than
+.B maxsize
+characters are placed into the array.
+Each conversion specifier is replaced by appropriate characters as described
+in the following list.
+The appropriate characters are determined by the
+.B LC_TIME
+category of the current locale and by the values contained in the
+structure pointed to by
+.BR timeptr .
+.TP
+.B %a
+is replaced by the locale's abbreviated weekday name.
+.TP
+.B %A
+is replaced by the locale's full weekday name.
+.TP
+.B %b
+is replaced by the locale's abbreviated month name.
+.TP
+.B %B
+is replaced by the locale's full month name.
+.TP
+.B %c
+is replaced by the locale's appropriate date and time representation.
+.TP
+.B %d
+is replaced by the day of the month as a decimal number
+.RB ( 01 - 31 ).
+.TP
+.B %H
+is replaced by the hour (24-hour clock) as a decimal number
+.RB ( 00 - 23 ).
+.TP
+.B %I
+is replaced by the hour (12-hour clock) as a decimal number
+.RB ( 01 - 12 ).
+.TP
+.B %j
+is replaced by the day of the year as a decimal number
+.RB ( 001 - 366 ).
+.TP
+.B %m
+is replaced by the month as a decimal number
+.RB ( 01 - 12 ).
+.TP
+.B %M
+is replaced by the minute as a decimal number
+.RB ( 00 - 59 ).
+.TP
+.B %p
+is replaced by the locale's equivalent of the AM/PM designations associated
+with a 12-hour clock.
+.TP
+.B %S
+is replaced by the second as a decimal number
+.RB ( 00 - 61 ).
+.TP
+.B %U
+is replaced by the week number of the year (the first Sunday as the first
+day of week 1) as a decimal number
+.RB ( 00 - 53 ).
+.TP
+.B %w
+is replaced by the weekday as a decimal number
+.RB [ "0 " (Sunday)- 6 ].
+.TP
+.B %W
+is replaced by the week number of the year (the first Monday as the first
+day of week 1) as a decimal number
+.RB ( 00 - 53 ).
+.TP
+.B %x
+is replaced by the locale's appropriate date representation.
+.TP
+.B %X
+is replaced by the locale's appropriate time representation.
+.TP
+.B %y
+is replaced by the year without century as a decimal number
+.RB ( 00 - 99 ).
+.TP
+.B %Y
+is replaced by the year with century as a decimal number.
+.TP
+.B %Z
+is replaced by the time zone name or abbreviation, or by no characters if
+no time zone is determinable.
+.TP
+.B %%
+is replaced by
+.BR % .
+.LP
+If a conversion specifier is not one of the above, the behavior is
+undefined.
+.SH RETURNS
+If the total number of resulting characters including the terminating null
+character is not more than
+.BR maxsize ,
+the
+.B strftime
+function returns the number of characters placed into the array pointed to
+by
+.B s
+not including the terminating null character.
+Otherwise, zero is returned and the contents of the array are indeterminate.
+.SH NON-ANSI EXTENSIONS
+If
+.B SYSV_EXT
+is defined when the routine is compiled, then the following additional
+conversions will be available.
+These are borrowed from the System V
+.IR cftime (3)
+and
+.IR ascftime (3)
+routines.
+.TP
+.B %D
+is equivalent to specifying
+.BR %m/%d/%y .
+.TP
+.B %e
+is replaced by the day of the month,
+padded with a blank if it is only one digit.
+.TP
+.B %h
+is equivalent to
+.BR %b ,
+above.
+.TP
+.B %n
+is replaced with a newline character (\s-1ASCII LF\s+1).
+.TP
+.B %r
+is equivalent to specifying
+.BR "%I:%M:%S %p" .
+.TP
+.B %R
+is equivalent to specifying
+.BR %H:%M .
+.TP
+.B %T
+is equivalent to specifying
+.BR %H:%M:%S .
+.TP
+.B %t
+is replaced with a \s-1TAB\s+1 character.
+.SH POSIX 1003.2 EXTENSIONS
+If
+.B POSIX2_DATE
+is defined, then all of the conversions available with
+.B SYSV_EXT
+are available, as well as the
+following additional conversions:
+.TP
+.B %C
+The century, as a number between 00 and 99.
+.LP
+In addition, the alternate representations
+.BR %Ec ,
+.BR %EC ,
+.BR %Ex ,
+.BR %Ey ,
+.BR %EY ,
+.BR %Od ,
+.BR %Oe ,
+.BR %OH ,
+.BR %OI ,
+.BR %Om ,
+.BR %OM ,
+.BR %OS ,
+.BR %OU ,
+.BR %Ow ,
+.BR %OW ,
+and
+.B %Oy
+are recognized, but their normal representations are used.
+.SH VMS EXTENSIONS
+If
+.B VMS_EXT
+is defined, then the following additional conversion is available:
+.TP
+.B %V
+The date in VMS format (e.g. 20-JUN-1991).
+.SH SEE ALSO
+time(2), ctime(3), localtime(3)
+.SH BUGS
+This version does not handle multibyte characters or pay attention to the
+setting of the
+.B LC_TIME
+environment variable.
+.LP
+It is not clear what is ``appropriate'' for the C locale; the values
+returned are a best guess on the author's part.
+.SH CAVEATS
+This implementation calls
+.IR tzset (3)
+exactly once. If the
+.B TZ
+environment variable is changed after
+.B strftime
+has been called, then
+.IR tzset (3)
+must be called again, explicitly, in order for the
+correct timezone information to be available.
+.SH AUTHOR
+.nf
+Arnold Robbins
+AudioFAX, Inc.
+Suite 200
+2000 Powers Ferry Road
+Marietta, GA. 30067
+U.S.A.
+INTERNET: arnold@audiofax.com
+UUCP: emory!audfax!arnold
+Phone: +1 404 618 4281
+Fax-box: +1 404 618 4581
+.fi
+.SH ACKNOWLEDGEMENTS
+Thanks to Geoff Clare <gwc@root.co.uk> for helping debug earlier
+versions of this routine.
+Additional thanks to Arthur David Olson <ado@elsie.nci.nih.gov>
+for some code improvements.
+
diff --git a/missing/strftime.c b/missing/strftime.c
new file mode 100644
index 00000000..11f41ce9
--- /dev/null
+++ b/missing/strftime.c
@@ -0,0 +1,384 @@
+/*
+ * strftime.c
+ *
+ * Public-domain relatively quick-and-dirty implementation of
+ * ANSI library routine for System V Unix systems.
+ *
+ * It's written in old-style C for maximal portability.
+ * However, since I'm used to prototypes, I've included them too.
+ *
+ * If you want stuff in the System V ascftime routine, add the SYSV_EXT define.
+ * For stuff needed to implement the P1003.2 date command, add POSIX2_DATE.
+ *
+ * The code for %c, %x, and %X is my best guess as to what's "appropriate".
+ * This version ignores LOCALE information.
+ * It also doesn't worry about multi-byte characters.
+ * So there.
+ *
+ * Arnold Robbins
+ * January, February, March, 1991
+ *
+ * Fixes from ado@elsie.nci.nih.gov
+ * February 1991
+ */
+
+#if 0
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <time.h>
+#include <sys/types.h>
+#endif
+
+#ifndef __STDC__
+#define const /**/
+#endif
+
+#ifndef __STDC__
+extern void tzset();
+extern char *strchr();
+static int weeknumber();
+#else
+extern void tzset(void);
+extern char *strchr(const char *str, int ch);
+static int weeknumber(const struct tm *timeptr, int firstweekday);
+#endif
+
+#if !defined(MSDOS) && !defined(TZNAME_MISSING)
+extern char *tzname[2];
+extern int daylight;
+#endif
+
+#define SYSV_EXT 1 /* stuff in System V ascftime routine */
+#define POSIX2_DATE 1 /* stuff in Posix 1003.2 date command */
+#define VMS_EXT 1 /* include %V for VMS date format */
+
+#if defined(POSIX2_DATE) && ! defined(SYSV_EXT)
+#define SYSV_EXT 1
+#endif
+
+/* strftime --- produce formatted time */
+
+#ifndef __STDC__
+size_t
+strftime(s, maxsize, format, timeptr)
+char *s;
+size_t maxsize;
+const char *format;
+const struct tm *timeptr;
+#else
+size_t
+strftime(char *s, size_t maxsize, const char *format, const struct tm *timeptr)
+#endif
+{
+ char *endp = s + maxsize;
+ char *start = s;
+ char tbuf[100];
+ int i;
+ static short first = 1;
+
+ /* various tables, useful in North America */
+ static char *days_a[] = {
+ "Sun", "Mon", "Tue", "Wed",
+ "Thu", "Fri", "Sat",
+ };
+ static char *days_l[] = {
+ "Sunday", "Monday", "Tuesday", "Wednesday",
+ "Thursday", "Friday", "Saturday",
+ };
+ static char *months_a[] = {
+ "Jan", "Feb", "Mar", "Apr", "May", "Jun",
+ "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
+ };
+ static char *months_l[] = {
+ "January", "February", "March", "April",
+ "May", "June", "July", "August", "September",
+ "October", "November", "December",
+ };
+ static char *ampm[] = { "AM", "PM", };
+
+ if (s == NULL || format == NULL || timeptr == NULL || maxsize == 0)
+ return 0;
+
+ if (strchr(format, '%') == NULL && strlen(format) + 1 >= maxsize)
+ return 0;
+
+ if (first) {
+ tzset();
+ first = 0;
+ }
+
+ for (; *format && s < endp - 1; format++) {
+ tbuf[0] = '\0';
+ if (*format != '%') {
+ *s++ = *format;
+ continue;
+ }
+ again:
+ switch (*++format) {
+ case '\0':
+ *s++ = '%';
+ goto out;
+
+ case '%':
+ *s++ = '%';
+ continue;
+
+ case 'a': /* abbreviated weekday name */
+ if (timeptr->tm_wday < 0 || timeptr->tm_wday > 6)
+ strcpy(tbuf, "?");
+ else
+ strcpy(tbuf, days_a[timeptr->tm_wday]);
+ break;
+
+ case 'A': /* full weekday name */
+ if (timeptr->tm_wday < 0 || timeptr->tm_wday > 6)
+ strcpy(tbuf, "?");
+ else
+ strcpy(tbuf, days_l[timeptr->tm_wday]);
+ break;
+
+#ifdef SYSV_EXT
+ case 'h': /* abbreviated month name */
+#endif
+ case 'b': /* abbreviated month name */
+ if (timeptr->tm_mon < 0 || timeptr->tm_mon > 11)
+ strcpy(tbuf, "?");
+ else
+ strcpy(tbuf, months_a[timeptr->tm_mon]);
+ break;
+
+ case 'B': /* full month name */
+ if (timeptr->tm_mon < 0 || timeptr->tm_mon > 11)
+ strcpy(tbuf, "?");
+ else
+ strcpy(tbuf, months_l[timeptr->tm_mon]);
+ break;
+
+ case 'c': /* appropriate date and time representation */
+ sprintf(tbuf, "%s %s %2d %02d:%02d:%02d %d",
+ days_a[timeptr->tm_wday],
+ months_a[timeptr->tm_mon],
+ timeptr->tm_mday,
+ timeptr->tm_hour,
+ timeptr->tm_min,
+ timeptr->tm_sec,
+ timeptr->tm_year + 1900);
+ break;
+
+ case 'd': /* day of the month, 01 - 31 */
+ sprintf(tbuf, "%02d", timeptr->tm_mday);
+ break;
+
+ case 'H': /* hour, 24-hour clock, 00 - 23 */
+ sprintf(tbuf, "%02d", timeptr->tm_hour);
+ break;
+
+ case 'I': /* hour, 12-hour clock, 01 - 12 */
+ i = timeptr->tm_hour;
+ if (i == 0)
+ i = 12;
+ else if (i > 12)
+ i -= 12;
+ sprintf(tbuf, "%02d", i);
+ break;
+
+ case 'j': /* day of the year, 001 - 366 */
+ sprintf(tbuf, "%03d", timeptr->tm_yday + 1);
+ break;
+
+ case 'm': /* month, 01 - 12 */
+ sprintf(tbuf, "%02d", timeptr->tm_mon + 1);
+ break;
+
+ case 'M': /* minute, 00 - 59 */
+ sprintf(tbuf, "%02d", timeptr->tm_min);
+ break;
+
+ case 'p': /* am or pm based on 12-hour clock */
+ if (timeptr->tm_hour < 12)
+ strcpy(tbuf, ampm[0]);
+ else
+ strcpy(tbuf, ampm[1]);
+ break;
+
+ case 'S': /* second, 00 - 61 */
+ sprintf(tbuf, "%02d", timeptr->tm_sec);
+ break;
+
+ case 'U': /* week of year, Sunday is first day of week */
+ sprintf(tbuf, "%d", weeknumber(timeptr, 0));
+ break;
+
+ case 'w': /* weekday, Sunday == 0, 0 - 6 */
+ sprintf(tbuf, "%d", timeptr->tm_wday);
+ break;
+
+ case 'W': /* week of year, Monday is first day of week */
+ sprintf(tbuf, "%d", weeknumber(timeptr, 1));
+ break;
+
+ case 'x': /* appropriate date representation */
+ sprintf(tbuf, "%s %s %2d %d",
+ days_a[timeptr->tm_wday],
+ months_a[timeptr->tm_mon],
+ timeptr->tm_mday,
+ timeptr->tm_year + 1900);
+ break;
+
+ case 'X': /* appropriate time representation */
+ sprintf(tbuf, "%02d:%02d:%02d",
+ timeptr->tm_hour,
+ timeptr->tm_min,
+ timeptr->tm_sec);
+ break;
+
+ case 'y': /* year without a century, 00 - 99 */
+ i = timeptr->tm_year % 100;
+ sprintf(tbuf, "%d", i);
+ break;
+
+ case 'Y': /* year with century */
+ sprintf(tbuf, "%d", 1900 + timeptr->tm_year);
+ break;
+
+ case 'Z': /* time zone name or abbreviation */
+ i = 0;
+ if (
+#ifndef TZNAME_MISSING
+ daylight &&
+#endif
+ timeptr->tm_isdst)
+ i = 1;
+#ifdef TZNAME_MISSING
+ strcpy(tbuf, timeptr->tm_zone);
+#else
+ strcpy(tbuf, tzname[i]);
+#endif
+ break;
+
+#ifdef SYSV_EXT
+ case 'n': /* same as \n */
+ tbuf[0] = '\n';
+ tbuf[1] = '\0';
+ break;
+
+ case 't': /* same as \t */
+ tbuf[0] = '\t';
+ tbuf[1] = '\0';
+ break;
+
+ case 'D': /* date as %m/%d/%y */
+ strftime(tbuf, sizeof tbuf, "%m/%d/%y", timeptr);
+ break;
+
+ case 'e': /* day of month, blank padded */
+ sprintf(tbuf, "%2d", timeptr->tm_mday);
+ break;
+
+ case 'r': /* time as %I:%M:%S %p */
+ strftime(tbuf, sizeof tbuf, "%I:%M:%S %p", timeptr);
+ break;
+
+ case 'R': /* time as %H:%M */
+ strftime(tbuf, sizeof tbuf, "%H:%M", timeptr);
+ break;
+
+ case 'T': /* time as %H:%M:%S */
+ strftime(tbuf, sizeof tbuf, "%H:%M:%S", timeptr);
+ break;
+#endif
+
+
+#ifdef VMS_EXT
+ case 'V': /* date as dd-bbb-YYYY */
+ sprintf(tbuf, "%2d-%3.3s-%4d",
+ timeptr->tm_mday,
+ months_a[timeptr->tm_mon],
+ timeptr->tm_year + 1900);
+ for (i = 3; i < 6; i++)
+ if (islower(tbuf[i]))
+ tbuf[i] = toupper(tbuf[i]);
+ break;
+#endif
+
+
+#ifdef POSIX2_DATE
+ case 'C':
+ sprintf(tbuf, "%02d", (timeptr->tm_year + 1900) / 100);
+ break;
+
+
+ case 'E':
+ case 'O':
+ /* POSIX locale extensions, ignored for now */
+ goto again;
+#endif
+ default:
+ tbuf[0] = '%';
+ tbuf[1] = *format;
+ tbuf[2] = '\0';
+ break;
+ }
+ i = strlen(tbuf);
+ if (i)
+ if (s + i < endp - 1) {
+ strcpy(s, tbuf);
+ s += i;
+ } else
+ return 0;
+ }
+out:
+ if (s < endp && *format == '\0') {
+ *s = '\0';
+ return (s - start);
+ } else
+ return 0;
+}
+
+/* weeknumber --- figure how many weeks into the year */
+
+/* With thanks and tip of the hatlo to ado@elsie.nci.nih.gov */
+
+#ifndef __STDC__
+static int
+weeknumber(timeptr, firstweekday)
+const struct tm *timeptr;
+int firstweekday;
+#else
+static int
+weeknumber(const struct tm *timeptr, int firstweekday)
+#endif
+{
+ if (firstweekday == 0)
+ return (timeptr->tm_yday + 7 - timeptr->tm_wday) / 7;
+ else
+ return (timeptr->tm_yday + 7 -
+ (timeptr->tm_wday ? (timeptr->tm_wday - 1) : 6)) / 7;
+}
+
+#if 0
+/* ADR --- I'm loath to mess with ado's code ... */
+
+Date: Wed, 24 Apr 91 20:54:08 MDT
+From: Michal Jaegermann <audfax!emory!vm.ucs.UAlberta.CA!NTOMCZAK>
+To: arnold@audiofax.com
+
+Hi Arnold,
+in a process of fixing of strftime() in libraries on Atari ST I grabbed
+some pieces of code from your own strftime. When doing that it came
+to mind that your weeknumber() function compiles a little bit nicer
+in the following form:
+/*
+ * firstweekday is 0 if starting in Sunday, non-zero if in Monday
+ */
+{
+ return (timeptr->tm_yday - timeptr->tm_wday +
+ (firstweekday ? (timeptr->tm_wday ? 8 : 1) : 7)) / 7;
+}
+How nicer it depends on a compiler, of course, but always a tiny bit.
+
+ Cheers,
+ Michal
+ ntomczak@vm.ucs.ualberta.ca
+#endif
diff --git a/missing/strtod.c b/missing/strtod.c
index 79350a1e..38c7ce50 100644
--- a/missing/strtod.c
+++ b/missing/strtod.c
@@ -21,13 +21,16 @@
* This bought us a 10% speedup on a sample program at uunet.uu.net.
*/
+#if 0
#include <ctype.h>
+#endif
extern double atof();
double
strtod (s, ptr)
-register char *s, **ptr;
+register const char *s;
+register char **ptr;
{
double ret = 0.0;
char *start = s;
diff --git a/missing/strtol.c b/missing/strtol.c
new file mode 100644
index 00000000..e102ae34
--- /dev/null
+++ b/missing/strtol.c
@@ -0,0 +1,120 @@
+/*
+Article 4291 of comp.lang.c:
+From: chris@mimsy.umd.edu (Chris Torek)
+Newsgroups: comp.lang.c
+Subject: Re: error checking strtol
+Message-ID: <24445@mimsy.umd.edu>
+Date: 17 May 90 09:31:17 GMT
+Organization: U of Maryland, Dept. of Computer Science, Coll. Pk., MD 20742
+
+The following is a working strtol. It depends only on the existence of
+correct header files (including <limits.h>) and on ASCII (IBM programmers
+will have to use strchr()). It does not support locales other than `C'.
+System V programmers should be able to replace their current strtol with
+this one. (After writing this, I checked the SVR2 source; it did not
+handle several cases correctly.)
+*/
+
+#ifdef __STDC__
+#include <limits.h>
+#else
+#define LONG_MIN (-0x80000000) /* for 32-bit 2s-complement at least */
+#define LONG_MAX 0x7fffffff
+#endif
+
+#if 0
+#include <limits.h>
+#include <ctype.h>
+#include <errno.h>
+#endif
+
+#ifndef _MSC_VER
+int errno;
+#endif
+
+/*
+ * strtol --- convert the initial portion of NPTR to a long in radix BASE.
+ *
+ * BASE 0 selects hex for a "0x"/"0X" prefix, octal for a leading "0", and
+ * decimal otherwise.  If ENDPTR is non-null it receives the address of the
+ * first unconsumed character, or NPTR itself when no digits were converted.
+ * On overflow returns LONG_MAX or LONG_MIN and sets errno to ERANGE.
+ * Ignores `locale' stuff; assumes letters and digits are each contiguous.
+ */
+long
+strtol(nptr, endptr, base)
+	const char *nptr;
+	char **endptr;
+	register int base;
+{
+	register const char *s = nptr;
+	register unsigned long acc;
+	register int c;
+	register unsigned long cutoff;
+	register int neg = 0, any, cutlim;
+
+	/*
+	 * Skip white space and pick up leading +/- sign if any.
+	 * Characters are fetched as unsigned char so that the <ctype.h>
+	 * tests never see a negative value on signed-char machines.
+	 */
+	do {
+		c = (unsigned char) *s++;
+	} while (isspace(c));
+	if (c == '-') {
+		neg = 1;
+		c = (unsigned char) *s++;
+	} else if (c == '+')
+		c = (unsigned char) *s++;
+	/*
+	 * Accept "0x" only when a hex digit follows; otherwise the "0" alone
+	 * is the number and the `x' must be left for the caller.
+	 */
+	if ((base == 0 || base == 16) && c == '0' &&
+	    (*s == 'x' || *s == 'X') && isxdigit((unsigned char) s[1])) {
+		c = (unsigned char) s[1];
+		s += 2;
+		base = 16;
+	}
+	if (base == 0)
+		base = c == '0' ? 8 : 10;
+
+	/*
+	 * cutoff is the largest legal magnitude divided by the base and
+	 * cutlim is the remainder: once acc exceeds cutoff, or equals it
+	 * with the next digit > cutlim, the value cannot be represented.
+	 * E.g. for 32-bit longs in base 10, cutoff is 214748364 and cutlim
+	 * is 7 (neg==0) or 8 (neg==1).
+	 *
+	 * Set any if any `digits' consumed; make it negative to indicate
+	 * overflow.
+	 */
+	cutoff = neg ? -(unsigned long)LONG_MIN : LONG_MAX;
+	cutlim = cutoff % (unsigned long)base;
+	cutoff /= (unsigned long)base;
+	for (acc = 0, any = 0;; c = (unsigned char) *s++) {
+		if (isdigit(c))
+			c -= '0';
+		else if (isalpha(c))
+			c -= isupper(c) ? 'A' - 10 : 'a' - 10;
+		else
+			break;
+		if (c >= base)
+			break;
+		if (any < 0 || acc > cutoff || (acc == cutoff && c > cutlim))
+			any = -1;
+		else {
+			any = 1;
+			acc *= base;
+			acc += c;
+		}
+	}
+	if (any < 0) {
+		acc = neg ? LONG_MIN : LONG_MAX;
+		errno = ERANGE;
+	} else if (neg)
+		acc = -acc;
+	if (endptr != 0)
+		*endptr = (char *) (any ? s - 1 : nptr);
+	return (acc);
+}
diff --git a/missing/system.c b/missing/system.c
new file mode 100644
index 00000000..bceca9e9
--- /dev/null
+++ b/missing/system.c
@@ -0,0 +1,7 @@
+int	/* system --- stub that only reports that system() is unavailable */
+system(s)
+char *s;	/* command string; never examined by this stub */
+{
+ fatal("system() not supported on this system");	/* every call ends up here */
+ return 0;	/* presumably unreachable if fatal() exits --- TODO confirm */
+}
diff --git a/missing/tmpnam.c b/missing/tmpnam.c
deleted file mode 100644
index 8f49859a..00000000
--- a/missing/tmpnam.c
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * tmpnam - an implementation for systems lacking a library version
- * this version does not rely on the P_tmpdir and L_tmpnam constants.
- */
-
-#ifndef NULL
-#define NULL 0
-#endif
-
-static char template[] = "/tmp/gawkXXXXXX";
-
-char *
-tmpnam(tmp)
-char *tmp;
-{
- static char tmpbuf[sizeof(template)];
-
- if (tmp == NULL) {
- (void) strcpy(tmpbuf, template);
- (void) mktemp(tmpbuf);
- return tmpbuf;
- } else {
- (void) strcpy(tmp, template);
- (void) mktemp(tmp);
- return tmp;
- }
-}
diff --git a/missing/tzset.c b/missing/tzset.c
new file mode 100644
index 00000000..7e0af48a
--- /dev/null
+++ b/missing/tzset.c
@@ -0,0 +1,38 @@
+/*
+ * tzset.c
+ *
+ * Quick and dirty emulation of tzset(), tzname[], and daylight
+ * for old BSD systems without it.
+ *
+ * Thanks to Rick Adams, rick@uunet.uu.net, for the basics.
+ *
+ * BUGS:
+ * Totally ignores the value of the TZ environment variable.
+ */
+
+#if 0
+#include <sys/time.h>
+#endif
+
+static char tz1[1024];
+static char tz2[1024];
+
+/* external variables */
+char *tzname[2] = {
+ tz1, tz2
+};
+int daylight;
+
+extern char *timezone();
+
+void
+tzset()
+{
+	struct timezone zone;	/* time-zone data filled in by gettimeofday() */
+	struct timeval now;	/* required by gettimeofday(); otherwise unused */
+
+	(void) gettimeofday(&now, &zone);
+	(void) strcpy(tz1, timezone(zone.tz_minuteswest, 0));	/* backs tzname[0] */
+	(void) strcpy(tz2, timezone(zone.tz_minuteswest, 1));	/* backs tzname[1] */
+	daylight = zone.tz_dsttime;
+}
diff --git a/missing/vprintf.c b/missing/vprintf.c
index dea4ca86..bfa529e8 100644
--- a/missing/vprintf.c
+++ b/missing/vprintf.c
@@ -1,11 +1,4 @@
-#include <stdio.h>
-#include <varargs.h>
-
-#ifndef BUFSIZ
-#include <stdio.h>
-#endif
-
-#ifndef va_dcl
+#if 0
#include <varargs.h>
#endif
diff --git a/mkconf b/mkconf
index 2ff6dbff..3a39811c 100755
--- a/mkconf
+++ b/mkconf
@@ -11,7 +11,7 @@ case "$#" in
esac
if [ -f config/$1 ]; then
- sh ./mungeconf config/$1 config.h-dist >config.h
+ sh ./mungeconf config/$1 config.h.in >config.h
# echo #echo lines to stdout
sed -n '/^#echo /s///p' config/$1
@@ -19,14 +19,14 @@ if [ -f config/$1 ]; then
sed -n '/^MAKE_.*/s//s,^##&## ,,/p' config/$1 >sedscr
if [ -s sedscr ]
then
- sed -f sedscr Makefile-dist >Makefile
+ sed -f sedscr Makefile.in >Makefile
else
- cp Makefile-dist Makefile
+ cp Makefile.in Makefile
fi
rm -f sedscr
else
echo "\`$1' is not a known configuration."
echo "Either construct one based on the examples in the config directory,"
- echo "or copy config.h-dist to config.h and edit it."
+ echo "or copy config.h.in to config.h and edit it."
exit 1
fi
diff --git a/msg.c b/msg.c
index eb96c4d5..629470ee 100644
--- a/msg.c
+++ b/msg.c
@@ -10,7 +10,7 @@
*
* GAWK is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 1, or (at your option)
+ * the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* GAWK is distributed in the hope that it will be useful,
@@ -20,7 +20,7 @@
*
* You should have received a copy of the GNU General Public License
* along with GAWK; see the file COPYING. If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include "awk.h"
@@ -38,22 +38,24 @@ va_list *argp;
char *file;
(void) fflush(stdout);
- (void) fprintf(stderr, "%s: %s ", myname, s);
- vfprintf(stderr, msg, *argp);
- (void) fprintf(stderr, "\n");
- if (FNR) {
- (void) fprintf(stderr, " input line number %d", FNR);
- file = FILENAME_node->var_value->stptr;
- if (file && !STREQ(file, "-"))
- (void) fprintf(stderr, ", file `%s'", file);
- (void) fprintf(stderr, "\n");
- }
+ (void) fprintf(stderr, "%s: ", myname);
if (sourceline) {
- (void) fprintf(stderr, " source line number %d", sourceline);
if (source)
- (void) fprintf(stderr, ", file `%s'", source);
- (void) fprintf(stderr, "\n");
+ (void) fprintf(stderr, "%s:", source);
+ else
+ (void) fprintf(stderr, "cmd. line:");
+
+ (void) fprintf(stderr, "%d: ", sourceline);
}
+ if (FNR) {
+ file = FILENAME_node->var_value->stptr;
+ if (file)
+ (void) fprintf(stderr, "(FILENAME=%s ", file);
+ (void) fprintf(stderr, "FNR=%d) ", FNR);
+ }
+ (void) fprintf(stderr, s);
+ vfprintf(stderr, msg, *argp);
+ (void) fprintf(stderr, "\n");
(void) fflush(stderr);
}
@@ -81,7 +83,7 @@ va_dcl
va_start(args);
mesg = va_arg(args, char *);
- err("warning:", mesg, &args);
+ err("warning: ", mesg, &args);
va_end(args);
}
@@ -95,7 +97,7 @@ va_dcl
va_start(args);
mesg = va_arg(args, char *);
- err("fatal error:", mesg, &args);
+ err("fatal: ", mesg, &args);
va_end(args);
#ifdef DEBUG
abort();
diff --git a/node.c b/node.c
index 3bfc5e4a..2d16fc6b 100644
--- a/node.c
+++ b/node.c
@@ -10,8 +10,8 @@
*
* GAWK is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 1, or (at your option)
- * any later version.
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
*
* GAWK is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -20,7 +20,7 @@
*
* You should have received a copy of the GNU General Public License
* along with GAWK; see the file COPYING. If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include "awk.h"
@@ -240,8 +240,10 @@ int flags;
c = *pf++;
if (c == '\\') {
c = parse_escape(&pf);
- if (c < 0)
- cant_happen();
+ if (c < 0) {
+ warning("backslash at end of string");
+ c = '\\';
+ }
*pt++ = c;
} else
*pt++ = c;
diff --git a/patchlevel.h b/patchlevel.h
index c6161a1f..f3608240 100644
--- a/patchlevel.h
+++ b/patchlevel.h
@@ -1 +1 @@
-#define PATCHLEVEL 2
+#define PATCHLEVEL 3
diff --git a/pc/make.bat b/pc/make.bat
index 301fdb1e..99d76512 100644
--- a/pc/make.bat
+++ b/pc/make.bat
@@ -12,8 +12,8 @@ REM AWK Progamming Language.
REM
REM GAWK is free software; you can redistribute it and/or modify
REM it under the terms of the GNU General Public License as published by
-REM the Free Software Foundation; either version 1, or (at your option)
-REM any later version.
+REM the Free Software Foundation; either version 2 of the License, or
+REM (at your option) any later version.
REM
REM GAWK is distributed in the hope that it will be useful,
REM but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -22,7 +22,7 @@ REM GNU General Public License for more details.
REM
REM You should have received a copy of the GNU General Public License
REM along with GAWK; see the file COPYING. If not, write to
-REM the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+REM the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
REM
REM debug flags: DEBUG=#-DDEBUG #-DFUNC_TRACE -DMEMDEBUG
REM DEBUGGER= #-Zi
diff --git a/protos.h b/protos.h
index 5f84915b..fd752dfd 100644
--- a/protos.h
+++ b/protos.h
@@ -10,8 +10,8 @@
*
* GAWK is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 1, or (at your option)
- * any later version.
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
*
* GAWK is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -20,7 +20,7 @@
*
* You should have received a copy of the GNU General Public License
* along with GAWK; see the file COPYING. If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#ifdef __STDC__
@@ -94,6 +94,7 @@ extern int close P((int));
extern int open P(());
extern int pipe P((int *));
extern int dup P((int));
+extern int dup2 P((int,int));
extern int fork P(());
extern int execl P((/* char *, char *, ... */));
extern int read P((int, char *, int));
diff --git a/re.c b/re.c
index 3b05e6bf..3909f0b1 100644
--- a/re.c
+++ b/re.c
@@ -10,8 +10,8 @@
*
* GAWK is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 1, or (at your option)
- * any later version.
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
*
* GAWK is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -20,7 +20,7 @@
*
* You should have received a copy of the GNU General Public License
* along with GAWK; see the file COPYING. If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include "awk.h"
@@ -39,6 +39,9 @@ int dfa;
memset((char *) rp, 0, sizeof(*rp));
emalloc(rp->pat.buffer, char *, 16, "make_regexp");
rp->pat.allocated = 16;
+ rp->regs.num_regs = 1;
+ emalloc(rp->regs.start, int *, sizeof(int), "make_regexp");
+ emalloc(rp->regs.end, int *, sizeof(int), "make_regexp");
emalloc(rp->pat.fastmap, char *, 256, "make_regexp");
if (ignorecase)
@@ -73,7 +76,7 @@ int need_start;
save1 = str[len];
str[len] = '\n';
save2 = str[len+1];
- ret = regexecute(&(rp->dfareg), str, str+len+1, 0, &count,
+ ret = regexecute(&(rp->dfareg), str, str+len+1, 1, &count,
&try_backref);
str[len] = save1;
str[len+1] = save2;
diff --git a/regex.c b/regex.c
index e59a169a..2cb3f2bd 100644
--- a/regex.c
+++ b/regex.c
@@ -1,9 +1,10 @@
/* Extended regular expression matching and search library.
- Copyright (C) 1985, 1989-90 Free Software Foundation, Inc.
+ Version 0.1.
+ Copyright (C) 1985, 89, 90, 91 Free Software Foundation, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 1, or (at your option)
+ the Free Software Foundation; either version 2, or (at your option)
any later version.
This program is distributed in the hope that it will be useful,
@@ -19,68 +20,95 @@
/* To test, compile with -Dtest. This Dtestable feature turns this into
a self-contained program which reads a pattern, describes how it
compiles, then reads a string and searches for it.
-
+
On the other hand, if you compile with both -Dtest and -Dcanned you
can run some tests we've already thought of. */
+#ifdef GAWK
+#include "config.h"
+#endif
+
+#ifdef REGEX_MALLOC
+
+#define REGEX_ALLOCATE malloc
+#define REGEX_REALLOCATE(source, size) (realloc (source, size))
+
+#else /* not REGEX_MALLOC */
+
+
+/* Make alloca work the best possible way. */
+#ifdef __GNUC__
+#define alloca __builtin_alloca
+#else
+#ifdef sparc
+#include <alloca.h>
+#else
+#ifdef _AIX
+ #pragma alloca
+#else /* not __GNUC__ or sparc or _AIX */
+char *alloca ();
+#endif /* _AIX */
+#endif /* sparc */
+#endif /* not __GNUC__ */
+
+/* Still not defined (REGEX_MALLOC) */
+
+#define REGEX_ALLOCATE alloca
+
+/* Requires a `void *destination' declared. */
+#define REGEX_REALLOCATE(source, size) \
+ (destination = alloca (size), \
+ bcopy (source, destination, size), \
+ destination)
+
+#endif /* not defined (REGEX_MALLOC) */
+
+
#ifdef emacs
/* The `emacs' switch turns on certain special matching commands
that make sense only in emacs. */
+#include "config.h"
#include "lisp.h"
#include "buffer.h"
#include "syntax.h"
-/* We write fatal error messages on standard error. */
-#include <stdio.h>
+/* Emacs uses `NULL' as a predicate. */
+#undef NULL
-/* isalpha(3) etc. are used for the character classes. */
-#include <ctype.h>
-#else /* not emacs */
+#else /* not emacs */
-#include "awk.h"
-#define NO_ALLOCA /* try it out for now */
-#ifndef NO_ALLOCA
-/* Make alloca work the best possible way. */
-#ifdef __GNUC__
-#ifndef atarist
-#ifndef alloca
-#define alloca __builtin_alloca
-#endif
-#endif /* atarist */
-#else
-#ifdef sparc
-#include <alloca.h>
+#include <sys/types.h> /* POSIX types. */
+
+#if defined(GAWK) || defined (USG) || defined (POSIX) || defined (STDC_HEADERS)
+#ifndef BSTRING
+#include <string.h>
+#define bcopy(s,d,n) memcpy ((d), (s), (n))
+#define bcmp(s1,s2,n) memcmp ((s1), (s2), (n))
+#define bzero(s,n) memset ((s), 0, (n))
+#endif /* not BSTRING */
+#endif /* USG or POSIX or STDC_HEADERS */
+
+#if defined (STDC_HEADERS)
+#include <stdlib.h>
#else
-char *alloca ();
-#endif
-#endif /* __GNUC__ */
-
-#define FREE_AND_RETURN_VOID(stackb) return
-#define FREE_AND_RETURN(stackb,val) return(val)
-#define DOUBLE_STACK(stackx,stackb,len) \
- (stackx = (unsigned char **) alloca (2 * len \
- * sizeof (unsigned char *)),\
- /* Only copy what is in use. */ \
- (unsigned char **) memcpy (stackx, stackb, len * sizeof (char *)))
-#else /* NO_ALLOCA defined */
-#define FREE_AND_RETURN_VOID(stackb) free(stackb);return
-#define FREE_AND_RETURN(stackb,val) free(stackb);return(val)
-#define DOUBLE_STACK(stackx,stackb,len) \
- (unsigned char **) realloc (stackb, 2 * len * sizeof (unsigned char *))
-#endif /* NO_ALLOCA */
-
-static void store_jump P((char *, int, char *));
-static void insert_jump P((int, char *, char *, char *));
-static void store_jump_n P((char *, int, char *, unsigned));
-static void insert_jump_n P((int, char *, char *, char *, unsigned));
-static void insert_op_2 P((int, char *, char *, int, int ));
-static int memcmp_translate P((unsigned char *, unsigned char *,
- int, unsigned char *));
+#ifdef __STDC__
+void *malloc (size_t);
+void *realloc (void *, size_t);
+#else /* not __STDC__ */
+char *malloc ();
+char *realloc ();
+#endif /* not __STDC__ */
+#endif /* not (POSIX or STDC_HEADERS) */
+
+
+
+/* How many characters in the character set. */
+#define CHAR_SET_SIZE 256
/* Define the syntax stuff, so we can do the \<, \>, etc. */
@@ -89,20 +117,18 @@ static int memcmp_translate P((unsigned char *, unsigned char *,
commands in re_match_2. */
#ifndef Sword
#define Sword 1
-#endif
+#endif /* not Sword */
#define SYNTAX(c) re_syntax_table[c]
#ifdef SYNTAX_TABLE
-char *re_syntax_table;
+extern char *re_syntax_table;
#else /* not SYNTAX_TABLE */
-static char re_syntax_table[256];
-static void init_syntax_once P((void));
-
+static char re_syntax_table[CHAR_SET_SIZE];
static void
init_syntax_once ()
@@ -113,7 +139,7 @@ init_syntax_once ()
if (done)
return;
- memset (re_syntax_table, 0, sizeof re_syntax_table);
+ bzero (re_syntax_table, sizeof re_syntax_table);
for (c = 'a'; c <= 'z'; c++)
re_syntax_table[c] = Sword;
@@ -123,161 +149,244 @@ init_syntax_once ()
for (c = '0'; c <= '9'; c++)
re_syntax_table[c] = Sword;
-
- /* Add specific syntax for ISO Latin-1. */
- for (c = 0300; c <= 0377; c++)
- re_syntax_table[c] = Sword;
- re_syntax_table[0327] = 0;
- re_syntax_table[0367] = 0;
+
+ re_syntax_table['_'] = Sword;
done = 1;
}
-#endif /* SYNTAX_TABLE */
-#undef P
-#endif /* emacs */
+#endif /* not SYNTAX_TABLE */
+#endif /* not emacs */
+/* We write fatal error messages on standard error. */
+#include <stdio.h>
+
+/* isalpha(3) etc. are used for the character classes. */
+#include <ctype.h>
/* Sequents are missing isgraph. */
-#ifndef isgraph
-#define isgraph(c) (isprint((c)) && !isspace((c)))
+#ifdef sequent
+#define ISGRAPH_MISSING
+#endif
+
+#ifdef ISGRAPH_MISSING
+#define isgraph(c) (isprint (c) && !isspace (c))
#endif
+
/* Get the interface, including the syntax bits. */
#include "regex.h"
+/* We will need this constant several times. */
+#define BYTEWIDTH 8
+
+
+
/* These are the command codes that appear in compiled regular
expressions, one per byte. Some command codes are followed by
argument bytes. A command code can specify any interpretation
whatsoever for its arguments. Zero-bytes may appear in the compiled
regular expression.
-
+
The value of `exactn' is needed in search.c (search_buffer) in emacs.
So regex.h defines a symbol `RE_EXACTN_VALUE' to be 1; the value of
`exactn' we use here must also be 1. */
enum regexpcode
{
- unused=0,
- exactn=1, /* Followed by one byte giving n, then by n literal bytes. */
- begline, /* Fail unless at beginning of line. */
- endline, /* Fail unless at end of line. */
- jump, /* Followed by two bytes giving relative address to jump to. */
- on_failure_jump, /* Followed by two bytes giving relative address of
- place to resume at in case of failure. */
- finalize_jump, /* Throw away latest failure point and then jump to
- address. */
- maybe_finalize_jump, /* Like jump but finalize if safe to do so.
- This is used to jump back to the beginning
- of a repeat. If the command that follows
- this jump is clearly incompatible with the
- one at the beginning of the repeat, such that
- we can be sure that there is no use backtracking
- out of repetitions already completed,
- then we finalize. */
- dummy_failure_jump, /* Jump, and push a dummy failure point. This
- failure point will be thrown away if an attempt
- is made to use it for a failure. A + construct
- makes this before the first repeat. Also
- use it as an intermediary kind of jump when
- compiling an or construct. */
- succeed_n, /* Used like on_failure_jump except has to succeed n times;
- then gets turned into an on_failure_jump. The relative
- address following it is useless until then. The
- address is followed by two bytes containing n. */
- jump_n, /* Similar to jump, but jump n times only; also the relative
- address following is in turn followed by yet two more bytes
- containing n. */
- set_number_at, /* Set the following relative location to the
- subsequent number. */
- anychar, /* Matches any (more or less) one character. */
- charset, /* Matches any one char belonging to specified set.
- First following byte is number of bitmap bytes.
- Then come bytes for a bitmap saying which chars are in.
- Bits in each byte are ordered low-bit-first.
- A character is in the set if its bit is 1.
- A character too large to have a bit in the map
- is automatically not in the set. */
- charset_not, /* Same parameters as charset, but match any character
- that is not one of those specified. */
- start_memory, /* Start remembering the text that is matched, for
- storing in a memory register. Followed by one
- byte containing the register number. Register numbers
- must be in the range 0 through RE_NREGS. */
- stop_memory, /* Stop remembering the text that is matched
- and store it in a memory register. Followed by
- one byte containing the register number. Register
- numbers must be in the range 0 through RE_NREGS. */
- duplicate, /* Match a duplicate of something remembered.
- Followed by one byte containing the index of the memory
- register. */
- before_dot, /* Succeeds if before point. */
- at_dot, /* Succeeds if at point. */
- after_dot, /* Succeeds if after point. */
- begbuf, /* Succeeds if at beginning of buffer. */
- endbuf, /* Succeeds if at end of buffer. */
- wordchar, /* Matches any word-constituent character. */
- notwordchar, /* Matches any char that is not a word-constituent. */
- wordbeg, /* Succeeds if at word beginning. */
- wordend, /* Succeeds if at word end. */
- wordbound, /* Succeeds if at a word boundary. */
- notwordbound,/* Succeeds if not at a word boundary. */
- syntaxspec, /* Matches any character whose syntax is specified.
- followed by a byte which contains a syntax code,
- e.g., Sword. */
- notsyntaxspec /* Matches any character whose syntax differs from
- that specified. */
+ no_op=0,
+ exactn=1, /* Followed by one byte giving n, then by n
+ literal bytes. */
+ begline, /* Fail unless at beginning of line. */
+ endline, /* Fail unless at end of line. */
+ endline_in_repeat, /* If in trailing position, turn into an endline,
+ otherwise, turn into a no_op. This should
+ never end up in the final compiled pattern! */
+ endline_before_newline,/* If after an endline, don't let that endline turn into
+ an exactn for '$' when RE_CONTEXT_INDEP_ANCHORS
+ is set. Should never end up in the compiled
+ pattern! */
+ repeated_endline_before_newline, /* A combination of above two. */
+ no_pop_jump, /* Followed by two byte relative address to
+ which to jump. */
+ jump_past_next_alt, /* Same as no_pop_jump, but don't jump if the
+ current group (the largest-numbered active
+ one) hasn't matched anything. */
+ on_failure_jump, /* Followed by two byte relative address of
+ place to resume at in case of failure. */
+ pop_failure_jump, /* Throw away latest failure point and then jump to
+ address. */
+ maybe_pop_jump,
+ /* Like jump but change to pop_failure_jump
+ only if know won't have to backtrack to
+ match; otherwise change to no_pop_jump.
+ This is used to jump back to the
+ beginning of a repeat. If what follows
+ this jump clearly won't match what the
+ repeat does, such that we can be sure
+ that there is no use backtracking out of
+ repetitions already matched, then we
+ change it to a pop_failure_jump. */
+ dummy_failure_jump, /* Jump, and push a dummy failure point. This
+ failure point will be thrown away if an
+ attempt is made to use it for a failure. A
+ `+' construct makes this before the first
+ repeat. Also use it as an intermediary kind
+ of jump when compiling an alternative. */
+ succeed_n, /* Used like on_failure_jump except has to
+ succeed n times; The two-byte relative
+ address following it is useless until then.
+ The address is followed by two bytes
+ containing n. */
+ no_pop_jump_n, /* Similar to no_pop_jump, but jump n times
+ only; also the relative address following is
+ in turn followed by yet two more bytes
+ containing n. */
+ set_number_at, /* Set the following relative location (two
+ bytes) to the subsequent (two-byte) number. */
+ anychar, /* Matches any (more or less) character. */
+ charset, /* Matches any one char belonging to specified set.
+ First following byte is number of bitmap
+ bytes. Then come bytes for a bitmap saying
+ which chars are in. Bits in each byte are
+ ordered low-bit-first. A character is in the
+ set if its bit is 1. A character too large
+ to have a bit in the map is automatically not
+ in the set. */
+ charset_not, /* Same parameters as charset, but match any
+ character that is not one of those specified. */
+ start_memory, /* Start remembering the text that is matched, for
+ storing in a memory register. Followed by
+ one byte containing the register number.
+ Register numbers will be in the range 0
+ through one less than the pattern buffer's
+ re_nsub field. */
+ stop_memory, /* Stop remembering the text that is matched
+ and store it in a memory register. Followed
+ by one byte containing the register number.
+ Register numbers will be in the range 0
+ through one less than the pattern buffer's
+ re_nsub field. */
+ duplicate, /* Match a duplicate of something remembered.
+ Followed by one byte containing the register
+ number. */
+ before_dot, /* Succeeds if before point. */
+ at_dot, /* Succeeds if at point. */
+ after_dot, /* Succeeds if after point. */
+ begbuf, /* Succeeds if at beginning of buffer. */
+ endbuf, /* Succeeds if at end of buffer. */
+ wordchar, /* Matches any word-constituent character. */
+ notwordchar, /* Matches any char that is not a word-constituent. */
+ wordbeg, /* Succeeds if at word beginning. */
+ wordend, /* Succeeds if at word end. */
+ wordbound, /* Succeeds if at a word boundary. */
+ notwordbound, /* Succeeds if not at a word boundary. */
+ syntaxspec, /* Matches any character whose syntax is specified.
+ followed by a byte which contains a syntax
+ code, e.g., Sword. */
+ notsyntaxspec /* Matches any character whose syntax differs from
+ that specified. */
};
-
-/* Number of failure points to allocate space for initially,
- when matching. If this number is exceeded, more space is allocated,
- so it is not a hard limit. */
-#ifndef NFAILURES
-#define NFAILURES 80
-#endif
-#ifdef CHAR_UNSIGNED
-#define SIGN_EXTEND_CHAR(c) ((c)>(char)127?(c)-256:(c)) /* for IBM RT */
+#ifdef CHAR_UNSIGNED /* for, e.g., IBM RT */
+#define SIGN_EXTEND_CHAR(c) (((c)^128) - 128) /* As in Harbison and Steele. */
#endif
#ifndef SIGN_EXTEND_CHAR
-#define SIGN_EXTEND_CHAR(x) (x)
+#define SIGN_EXTEND_CHAR /* As nothing. */
#endif
-
+
+
/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
+
#define STORE_NUMBER(destination, number) \
- { (destination)[0] = (number) & 0377; \
- (destination)[1] = (number) >> 8; }
-
+ do {(destination)[0] = (number) & 0377; \
+ (destination)[1] = (number) >> 8; \
+ } while (0)
+
+
/* Same as STORE_NUMBER, except increment the destination pointer to
the byte after where the number is stored. Watch out that values for
DESTINATION such as p + 1 won't work, whereas p will. */
+
#define STORE_NUMBER_AND_INCR(destination, number) \
- { STORE_NUMBER(destination, number); \
- (destination) += 2; }
+ do { STORE_NUMBER(destination, number); \
+ (destination) += 2; \
+ } while (0)
+
+
+
+
/* Put into DESTINATION a number stored in two contingous bytes starting
at SOURCE. */
+
#define EXTRACT_NUMBER(destination, source) \
- { (destination) = *(source) & 0377; \
- (destination) += SIGN_EXTEND_CHAR (*(char *)((source) + 1)) << 8; }
+ do { (destination) = *(source) & 0377; \
+ (destination) += SIGN_EXTEND_CHAR (*(char *)((source) + 1)) << 8; \
+ } while (0)
+
+/* Function form of EXTRACT_NUMBER: return the number stored in the two
+   bytes at SOURCE.  The first byte supplies the low-order eight bits; the
+   second is read as a `char', passed through SIGN_EXTEND_CHAR, and shifted
+   left by eight.  */
+
+int
+extract_number (source)
+     unsigned char *source;
+{
+  int answer;
+
+  /* Low-order byte, masked to eight bits.  */
+  answer = *source & 0377;
+
+  /* High-order byte, sign-extended, so the result can be negative.  */
+  answer += (SIGN_EXTEND_CHAR (*(char *)((source) + 1))) << 8;
+
+  return answer;
+}
+
/* Same as EXTRACT_NUMBER, except increment the pointer for source to
point to second byte of SOURCE. Note that SOURCE has to be a value
such as p, not, e.g., p + 1. */
+
#define EXTRACT_NUMBER_AND_INCR(destination, source) \
- { EXTRACT_NUMBER (destination, source); \
- (source) += 2; }
+ do { EXTRACT_NUMBER (destination, source); \
+ (source) += 2; \
+ } while (0)
+
+
+void
+extract_number_and_incr (destination, source)
+     int *destination; unsigned char **source;
+{
+  unsigned char *number = *source;	/* the two bytes to decode */
+  *destination = extract_number (number);
+  *source = number + 2;			/* step past them */
+}
+
+
+
+typedef enum { false = 0, true = 1 } boolean;
+
+/* Number of failure points for which to initially allocate space
+ when matching. If this number is exceeded, we allocate more space---
+ so it is not a hard limit. */
+
+#ifndef INIT_FAILURE_ALLOC
+#define INIT_FAILURE_ALLOC 5
+#endif
/* Specify the precise syntax of regexps for compilation. This provides
for compatibility for various utilities which historically have
different, incompatible syntaxes.
-
+
The argument SYNTAX is a bit-mask comprised of the various bits
defined in regex.h. */
@@ -296,134 +405,380 @@ re_set_syntax (syntax)
int obscure_syntax = 0;
+
+/* Routine used by re_compile_pattern, re_comp and regcomp. */
+
+#ifdef __STDC__
+static char *regex_compile (const char *pattern, const int size,
+ const int syntax, struct re_pattern_buffer *bufp);
+#else
+static char *regex_compile ();
+#endif
+
+
-/* Macros for re_compile_pattern, which is found below these definitions. */
+/* re_compile_pattern takes a regular-expression string and converts it
+ into a buffer full of byte commands for matching.
+
+ PATTERN is the address of the pattern string.
+ SIZE is the length of it.
+
+ BUFP is a struct re_pattern_buffer * whose pertinent fields are
+ mentioned below:
+
+ It has a char * field called BUFFER which points to the
+ space where this routine will put the compiled pattern; the
+ user can either allocate this using malloc (whereupon they
+ should set the long field ALLOCATED to the number of bytes
+ malloced) or set ALLOCATED to 0 and let the routine
+ allocate it. The routine may use realloc to enlarge the
+ buffer space.
+
+ If the user wants to translate all ordinary elements in the
+ compiled pattern, they should set the char * field
+ TRANSLATE to a translate table, otherwise, they should set
+ it to 0.
+
+ The routine sets the int field SYNTAX to the value of the
+ global variable `obscure_syntax'.
+
+ It returns in the long field USED how many bytes long the
+ compiled pattern is.
+
+ It returns 0 in the char field FASTMAP_ACCURATE, on
+ the assumption that the user usually doesn't compile the
+ same pattern twice and that consequently any fastmap in the
+ pattern buffer is inaccurate.
+
+ In the size_t field RE_NSUB, it returns the number of
+ subexpressions it found in PATTERN.
+
+ Returns 0 if the pattern was valid and an error string if it wasn't. */
+
+/* Public entry point: compile PATTERN through regex_compile using the global `obscure_syntax'. */
+char *
+re_compile_pattern (pattern, size, bufp)
+ const char *pattern;
+ const int size;
+ struct re_pattern_buffer *bufp;
+{
+ bufp->return_default_num_regs = (obscure_syntax & RE_ALLOCATE_REGISTERS) > 0;
+ /* The flag above mirrors the RE_ALLOCATE_REGISTERS syntax bit; regex_compile's result is returned unchanged. */
+ return regex_compile (pattern, size, obscure_syntax, bufp);
+}
+
+
+
+/* Macros for regex_compile. */
#define CHAR_CLASS_MAX_LENGTH 6
-/* Fetch the next character in the uncompiled pattern, translating it if
- necessary. */
+
+/* Fetch the next character in the uncompiled pattern---translating it
+ if necessary. */
+
#define PATFETCH(c) \
- {if (p == pend) goto end_of_pattern; \
- c = * (unsigned char *) p++; \
- if (translate) c = translate[c]; }
+ do {if (p == pend) goto end_of_pattern; \
+ c = * (unsigned char *) p++; \
+ if (translate) \
+ c = translate[c]; \
+ } while (0)
/* Fetch the next character in the uncompiled pattern, with no
translation. */
+
#define PATFETCH_RAW(c) \
- {if (p == pend) goto end_of_pattern; \
- c = * (unsigned char *) p++; }
+ do {if (p == pend) goto end_of_pattern; \
+ c = * (unsigned char *) p++; \
+ } while (0)
#define PATUNFETCH p--
+/* Pattern offset stuff. */
+
+#define INIT_PATTERN_OFFSETS_LIST_SIZE 32
+
+typedef short pattern_offset_type;
+
+typedef struct {
+ pattern_offset_type *offsets;
+ unsigned size;
+ unsigned avail;
+} pattern_offsets_list_type;
+
+#define PATTERN_OFFSETS_LIST_PTR_FULL(pattern_offsets_list_ptr) \
+ (pattern_offsets_list_ptr->avail == pattern_offsets_list_ptr->size)
+
+
+/* Anchor and op list stuff. */
+
+typedef pattern_offsets_list_type anchor_list_type;
+typedef pattern_offsets_list_type op_list_type;
+
+
+
+/* Bits list declaration. An arbitrarily long string of bits. */
+
+typedef struct {
+ unsigned *bits;
+ unsigned size;
+} bits_list_type;
+
+
+/* Bits list macros. See below for routines. */
+
+#define BITS_BLOCK_SIZE (sizeof (unsigned) * BYTEWIDTH)
+#define BITS_BLOCK(position) ((position) / BITS_BLOCK_SIZE)
+#define BITS_MASK(position) (1 << ((position) % BITS_BLOCK_SIZE))
+
+
+/* Initialize BITS_LIST (of type bits_list_type) to have one bits
+ block. Mostly analogous to routine init_bits_list, but, if
+ REGEX_MALLOC is not defined, uses `alloca' instead of `malloc'. This
+ is because using malloc in re_search* or re_match* could cause core
+ leaks when C-g is used in Emacs, plus malloc's slower and causes
+ storage fragmentation. This has to be a macro because the results of
+ `alloca' disappear at the end of the routine it's in. (If for some
+ reason you delete this explanation, please put it in the comment for
+ the failure stack.)
+
+ Return 1 if there's enough memory to do so and 0 if there isn't. */
+
+#define INIT_BITS_LIST(bits_list) \
+ (bits_list.bits = (unsigned *) REGEX_ALLOCATE (sizeof (unsigned)), \
+ bits_list.bits == NULL \
+ ? 0 \
+ : (bits_list.size = BITS_BLOCK_SIZE, \
+ bits_list.bits[0] = 0, \
+ 1))
+
+
+/* Extend BITS_LIST_PTR (of type bits_list_type) by one bits block.
+ Return 1 if there's enough memory to do so and 0 if there isn't.
+ Analogous to routine extend_bits_list, but uses alloca instead of
+ realloc, for reasons stated above in INIT_BITS_LIST's comment.
+
+ Because REGEX_REALLOCATE requires a declaration of `void
+ *destination', so does this. */
+
+
+#define EXTEND_BITS_LIST(bits_list) \
+ (bits_list.bits \
+ = (unsigned *) REGEX_REALLOCATE (bits_list.bits, \
+ bits_list.size / BYTEWIDTH \
+ + BITS_BLOCK_SIZE / BYTEWIDTH), \
+ bits_list.bits == NULL \
+ ? 0 \
+ : (bits_list.size += BITS_BLOCK_SIZE, \
+ bits_list.bits[(bits_list.size/BITS_BLOCK_SIZE) - 1] = 0, \
+ 1))
+
+
+/* Set the bit for a positive POSITION in BITS_LIST_PTR to VALUE, which,
+ in turn, can only be 0 or 1.
+
+ Returns 1 if can set the bit.
+ 0 if ran out of memory allocating (if necessary) room for it.
+ value if the value is invalid (i.e., not 0 or 1).
+
+ Because EXTEND_BITS_LIST requires a declaration of `void
+ *destination', so does this. */
+
+#define SET_BIT_TO_VALUE(bits_list, position, value) \
+ (position > bits_list.size - 1 \
+ && !EXTEND_BITS_LIST (bits_list) \
+ ? 0 \
+ : (value == 1 \
+ ? (bits_list.bits[BITS_BLOCK (position)] \
+ |= BITS_MASK (position), 1) \
+ : (value == 0 \
+ ? (bits_list.bits[BITS_BLOCK (position)] \
+ &= ~(BITS_MASK (position)), 1) \
+ : value) \
+ ))
+
+
+
+/* Compile stack stuff. */
+
+typedef struct {
+ pattern_offset_type laststart_offset;
+ pattern_offset_type fixup_alt_jump;
+ pattern_offset_type regnum;
+ pattern_offset_type begalt_offset;
+} compile_stack_element;
+
+
+typedef struct {
+ compile_stack_element *stack;
+ unsigned size;
+ unsigned avail; /* Offset of next open position. */
+ } compile_stack_type;
+
+
+#define INIT_COMPILE_STACK_SIZE 32
+
+#define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
+#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
+
+
/* If the buffer isn't allocated when it comes in, use this. */
-#define INIT_BUF_SIZE 28
+#define INIT_BUF_SIZE 32
/* Make sure we have at least N more bytes of space in buffer. */
#define GET_BUFFER_SPACE(n) \
{ \
- while (b - bufp->buffer + (n) >= bufp->allocated) \
- EXTEND_BUFFER; \
+ while (b - bufp->buffer + (n) > bufp->allocated) \
+ EXTEND_BUFFER \
}
-/* Make sure we have one more byte of buffer space and then add CH to it. */
-#define BUFPUSH(ch) \
- { \
+/* Make sure we have one more byte of buffer space and then add C to it. */
+#define BUF_PUSH(c) \
+ do { \
GET_BUFFER_SPACE (1); \
- *b++ = (char) (ch); \
- }
-
-/* Extend the buffer by twice its current size via reallociation and
- reset the pointers that pointed into the old allocation to point to
- the correct places in the new allocation. If extending the buffer
- results in it being larger than 1 << 16, then flag memory exhausted. */
+ *b++ = (char) (c); \
+ } while (0)
+
+/* Make sure we have two more bytes of buffer space and then add C1 and
+ C2 to it. */
+#define BUF_PUSH_2(c1, c2) \
+ do { \
+ GET_BUFFER_SPACE (2); \
+ *b++ = (char) (c1); \
+ *b++ = (char) (c2); \
+ } while (0)
+
+
+
+#define MAX_BUF_SIZE (1L << 16)
+
+/* Extend the buffer by twice its current size via realloc and
+ reset the pointers that pointed into the old block to point to the
+ correct places in the new one. If extending the buffer results in it
+ being larger than MAX_BUF_SIZE, then flag memory exhausted. */
#define EXTEND_BUFFER \
- { char *old_buffer = bufp->buffer; \
- if (bufp->allocated == (1L<<16)) goto too_big; \
- bufp->allocated *= 2; \
- if (bufp->allocated > (1L<<16)) bufp->allocated = (1L<<16); \
+ { \
+ char *old_buffer = bufp->buffer; \
+ if (bufp->allocated == MAX_BUF_SIZE) \
+ goto too_big; \
+ bufp->allocated <<= 1; \
+ if (bufp->allocated > MAX_BUF_SIZE) \
+ bufp->allocated = MAX_BUF_SIZE; \
bufp->buffer = (char *) realloc (bufp->buffer, bufp->allocated); \
- if (bufp->buffer == 0) \
+ if (bufp->buffer == NULL) \
goto memory_exhausted; \
b = (b - old_buffer) + bufp->buffer; \
- if (fixup_jump) \
- fixup_jump = (fixup_jump - old_buffer) + bufp->buffer; \
+ begalt = (begalt - old_buffer) + bufp->buffer; \
+ beg_interval = (beg_interval - old_buffer) + bufp->buffer; \
+ if (fixup_alt_jump) \
+ fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer; \
if (laststart) \
laststart = (laststart - old_buffer) + bufp->buffer; \
- begalt = (begalt - old_buffer) + bufp->buffer; \
if (pending_exact) \
pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
}
-/* Set the bit for character C in a character set list. */
+/* Set the bit for character C in a list. */
#define SET_LIST_BIT(c) (b[(c) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
/* Get the next unsigned number in the uncompiled pattern. */
#define GET_UNSIGNED_NUMBER(num) \
- { if (p != pend) \
- { \
- PATFETCH (c); \
- while (isdigit (c)) \
- { \
- if (num < 0) \
- num = 0; \
- num = num * 10 + c - '0'; \
- if (p == pend) \
- break; \
- PATFETCH (c); \
- } \
- } \
- }
+ { if (p != pend) \
+ { \
+ PATFETCH (c); \
+ while (isdigit (c)) \
+ { \
+ if (num < 0) \
+ num = 0; \
+ num = num * 10 + c - '0'; \
+ if (p == pend) \
+ break; \
+ PATFETCH (c); \
+ } \
+ } \
+ }
+
+
+#define DO_RANGE \
+ { \
+ /* Get untranslated range start and end characters. */ \
+ char this_char = p[-2]; \
+ char end; \
+ \
+ if (p == pend) \
+ goto invalid_range_end; \
+ PATFETCH_RAW (end); \
+ if ((syntax & RE_NO_EMPTY_RANGES) && this_char > end) \
+ goto invalid_range_end; \
+ while (this_char <= end) \
+ { \
+ SET_LIST_BIT (translate ? translate[this_char] : this_char); \
+ this_char++; \
+ } \
+ }
+
+
+#define IS_CHAR_CLASS(string) \
+ (strcmp (string, "alpha") == 0 || strcmp (string, "upper") == 0 \
+ || strcmp (string, "lower") == 0 || strcmp (string, "digit") == 0 \
+ || strcmp (string, "alnum") == 0 || strcmp (string, "xdigit") == 0 \
+ || strcmp (string, "space") == 0 || strcmp (string, "print") == 0 \
+ || strcmp (string, "punct") == 0 || strcmp (string, "graph") == 0 \
+ || strcmp (string, "cntrl") == 0) \
+
+
+
+/* Subroutines for regex_compile. */
-/* Subroutines for re_compile_pattern. */
static void store_jump (), insert_jump (), store_jump_n (),
- insert_jump_n (), insert_op_2 ();
+ insert_jump_n (), insert_op_2 (), remove_intervening_anchors (),
+ clear_this_and_higher_levels (), increase_level (),
+ decrease_level (), adjust_pattern_offsets_list ();
-/* re_compile_pattern takes a regular-expression string
- and converts it into a buffer full of byte commands for matching.
+static unsigned record_anchor_position (), init_bits_list (),
+ get_level_match_status (),
+ set_this_level (), set_next_lower_level (),
+ make_group_active (), make_group_inactive (),
+ set_match_status_of_active_groups (),
+ get_group_match_status (), add_op (),
+ init_pattern_offsets_list ();
- PATTERN is the address of the pattern string
- SIZE is the length of it.
- BUFP is a struct re_pattern_buffer * which points to the info
- on where to store the byte commands.
- This structure contains a char * which points to the
- actual space, which should have been obtained with malloc.
- re_compile_pattern may use realloc to grow the buffer space.
+static boolean is_in_compile_stack (), lower_levels_match_nothing (),
+ no_levels_match_anything (), verify_and_adjust_endlines ();
- The number of bytes of commands can be found out by looking in
- the `struct re_pattern_buffer' that bufp pointed to, after
- re_compile_pattern returns. */
-char *
-re_compile_pattern (pattern, size, bufp)
- char *pattern;
- size_t size;
+static char *
+regex_compile (pattern, size, syntax, bufp)
+ const char *pattern;
+ const int size;
+ const int syntax;
struct re_pattern_buffer *bufp;
{
register char *b = bufp->buffer;
- register char *p = pattern;
- char *pend = pattern + size;
+ const char *p = pattern;
+ const char *pend = pattern + size;
register unsigned c, c1;
- char *p1;
+ const char *p1;
unsigned char *translate = (unsigned char *) bufp->translate;
+ boolean enough_memory;
/* Address of the count-byte of the most recently inserted `exactn'
command. This makes it possible to tell whether a new exact-match
character can be added to that command or requires a new `exactn'
command. */
-
+
char *pending_exact = 0;
- /* Address of the place where a forward-jump should go to the end of
+ /* Address of the place where a forward jump should go to the end of
the containing expression. Each alternative of an `or', except the
- last, ends with a forward-jump of this sort. */
+ last, ends with a forward jump of this sort. */
- char *fixup_jump = 0;
+ char *fixup_alt_jump = 0;
/* Address of start of the most recently finished expression.
- This tells postfix * where to find the start of its operand. */
+ This tells, e. g., postfix * where to find the start of its operand. */
char *laststart = 0;
@@ -435,10 +790,10 @@ re_compile_pattern (pattern, size, bufp)
char many_times_ok;
- /* Address of beginning of regexp, or inside of last \(. */
+ /* Address of beginning of regexp, or inside of last group. */
char *begalt = b;
-
+
/* In processing an interval, at least this many matches must be made. */
int lower_bound;
@@ -447,19 +802,8 @@ re_compile_pattern (pattern, size, bufp)
/* Place in pattern (i.e., the {) to which to go back if the interval
is invalid. */
- char *beg_interval = 0;
-
- /* Stack of information saved by \( and restored by \).
- Four stack elements are pushed by each \(:
- First, the value of b.
- Second, the value of fixup_jump.
- Third, the value of regnum.
- Fourth, the value of begalt. */
-
- int stackb[40];
- int *stackp = stackb;
- int *stacke = stackb + 40;
- int *stackt;
+ const char *beg_interval = 0;
+ const char *following_left_brace = 0;
/* Counts \('s as they are encountered. Remembered for the matching \),
where it becomes the register number to put in the stop_memory
@@ -467,7 +811,69 @@ re_compile_pattern (pattern, size, bufp)
int regnum = 1;
+ compile_stack_type compile_stack;
+ anchor_list_type anchor_list;
+
+ /* Keeps track of whether or not the pattern at a given grouping level
+ matches the empty string so far. Each bit in the `bits' field of
+ this variable corresponds to a level, starting at level zero (i.e.,
+ the whole pattern) at the rightmost bit of bits[0]. Level 1 is the
+ bit to the left of that, etc. Additional bits that won't fit in
+ bits[0] are in bits[1], bits[2], etc. */
+
+ bits_list_type level_match_status;
+ unsigned current_level = 0;
+
+ /* Does a similar thing for groups that the above variable does for
+ levels. */
+ bits_list_type group_match_status;
+
+ /* Keeps track of whether or not a given group is active. Accessed as
+ is group_match_status. */
+ bits_list_type group_active_status;
+
+ /* Keeps track of operations relevant to detecting valid position of '$'. */
+ op_list_type op_list;
+
+ /* Keeps track of whether or not we hit a `$' since the beginning of
+ the pattern or the last (if any) alternative; if so, then `^' is an
+ ordinary character. */
+
+ boolean had_an_endline = false;
+
+
+ compile_stack.stack
+ = (compile_stack_element *) malloc (INIT_COMPILE_STACK_SIZE
+ * sizeof (compile_stack_element));
+
+ if (compile_stack.stack == NULL)
+ goto memory_exhausted;
+
+ compile_stack.size = INIT_COMPILE_STACK_SIZE;
+ compile_stack.avail = 0;
+
+
+ if (syntax & RE_REPEATED_ANCHORS_AWAY)
+ if (!init_pattern_offsets_list (&anchor_list,
+ INIT_COMPILE_STACK_SIZE << 1))
+ goto memory_exhausted;
+
+ if (!(init_bits_list (&level_match_status)
+ && init_bits_list (&group_match_status)
+ && init_bits_list (&group_active_status)))
+ goto memory_exhausted;
+
+
+ if (!init_pattern_offsets_list (&op_list, INIT_PATTERN_OFFSETS_LIST_SIZE))
+ goto memory_exhausted;
+
+
+ bufp->syntax = syntax;
bufp->fastmap_accurate = 0;
+ bufp->not_bol = bufp->not_eol = 0;
+
+ /* Always count groups, whether or not bufp->no_sub is set. */
+ bufp->re_nsub = 0;
#ifndef emacs
#ifndef SYNTAX_TABLE
@@ -476,15 +882,21 @@ re_compile_pattern (pattern, size, bufp)
#endif
#endif
+
if (bufp->allocated == 0)
{
bufp->allocated = INIT_BUF_SIZE;
if (bufp->buffer)
- /* EXTEND_BUFFER loses when bufp->allocated is 0. */
- bufp->buffer = (char *) realloc (bufp->buffer, INIT_BUF_SIZE);
+ {
+ /* EXTEND_BUFFER loses when bufp->allocated is 0. This loses if
+ buffer's address is bogus. */
+ bufp->buffer = (char *) realloc (bufp->buffer, INIT_BUF_SIZE);
+ }
else
- /* Caller did not allocate a buffer. Do it for them. */
- bufp->buffer = (char *) malloc (INIT_BUF_SIZE);
+ {
+ /* Caller did not allocate a buffer. Do it for them. */
+ bufp->buffer = (char *) malloc (INIT_BUF_SIZE);
+ }
if (!bufp->buffer) goto memory_exhausted;
begalt = b = bufp->buffer;
}
@@ -494,431 +906,669 @@ re_compile_pattern (pattern, size, bufp)
PATFETCH (c);
switch (c)
- {
- case '$':
- {
- char *p1 = p;
- /* When testing what follows the $,
- look past the \-constructs that don't consume anything. */
- if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
- while (p1 != pend)
- {
- if (*p1 == '\\' && p1 + 1 != pend
- && (p1[1] == '<' || p1[1] == '>'
- || p1[1] == '`' || p1[1] == '\''
-#ifdef emacs
- || p1[1] == '='
-#endif
- || p1[1] == 'b' || p1[1] == 'B'))
- p1 += 2;
- else
- break;
- }
- if (obscure_syntax & RE_TIGHT_VBAR)
- {
- if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS) && p1 != pend)
- goto normal_char;
- /* Make operand of last vbar end before this `$'. */
- if (fixup_jump)
- store_jump (fixup_jump, jump, b);
- fixup_jump = 0;
- BUFPUSH (endline);
- break;
- }
- /* $ means succeed if at end of line, but only in special contexts.
- If validly in the middle of a pattern, it is a normal character. */
-
- if ((obscure_syntax & RE_CONTEXTUAL_INVALID_OPS) && p1 != pend)
- goto invalid_pattern;
- if (p1 == pend || *p1 == '\n'
- || (obscure_syntax & RE_CONTEXT_INDEP_OPS)
- || (obscure_syntax & RE_NO_BK_PARENS
- ? *p1 == ')'
- : *p1 == '\\' && p1[1] == ')')
- || (obscure_syntax & RE_NO_BK_VBAR
- ? *p1 == '|'
- : *p1 == '\\' && p1[1] == '|'))
+ {
+ case '$':
+ {
+ if ((syntax & RE_ANCHORS_ONLY_AT_ENDS) && p != pend
+ && (syntax & RE_CONTEXT_INVALID_ANCHORS))
+ goto invalid_pattern;
+
+ if (syntax & RE_TIGHT_ALT)
+ {
+ /* Make operand of last alternation jump to this endline. */
+
+ if (fixup_alt_jump)
+ store_jump (fixup_alt_jump, jump_past_next_alt, b);
+
+ fixup_alt_jump = 0;
+ }
+
+ if (syntax & RE_REPEATED_ANCHORS_AWAY)
+ if (!record_anchor_position (!COMPILE_STACK_EMPTY,
+ b - bufp->buffer, &anchor_list))
+ goto memory_exhausted;
+
+ if (!add_op (&op_list, b - bufp->buffer))
+ goto memory_exhausted;
+
+ BUF_PUSH ((p != pend && *p == '\n')
+ ? (int) endline_before_newline
+ : (int) endline);
+
+ /* If there's a chance this endline would have to turn into
+ `exactn 1 '$',' have to push dummy ops to make room;
+ can't insert later because would mess up any surrounding
+ jumps. */
+
+ if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
+ && !((syntax & RE_ANCHORS_ONLY_AT_ENDS) && p == pend))
{
- BUFPUSH (endline);
- break;
- }
- goto normal_char;
+ laststart = b - 1;
+ BUF_PUSH_2 (no_op, no_op);
+ }
+
+ had_an_endline = true;
+ break;
}
- case '^':
- /* ^ means succeed if at beg of line, but only if no preceding
- pattern. */
+
+ case '^':
+ /* If change anything in this case, have to change analogous
+ code in *endline* (yes, endline---because the routine goes
+ backwards through the pattern) case of the routine
+ verify_and_adjust_endlines. */
- if ((obscure_syntax & RE_CONTEXTUAL_INVALID_OPS) && laststart)
- goto invalid_pattern;
- if (laststart && p - 2 >= pattern && p[-2] != '\n'
- && !(obscure_syntax & RE_CONTEXT_INDEP_OPS))
- goto normal_char;
- if (obscure_syntax & RE_TIGHT_VBAR)
- {
- if (p != pattern + 1
- && ! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
- goto normal_char;
- BUFPUSH (begline);
- begalt = b;
- }
- else
- BUFPUSH (begline);
- break;
+ /* ^ means match the beginning of a string. If
+ RE_CONTEXT_INDEP_ANCHORS is set, then it represents the
+ match-beginning-of-line operator anywhere in the regular
+ expression.
+
+ If that bit isn't set, then it represents the
+ match-beginning-of-line operator in leading positions and
+ matches itself in other positions (unless it's invalid
+ there). */
+
+ /* If the '^' must be at the pattern's beginning or else is
+ in a leading position. */
+
+ if (((syntax & RE_ANCHORS_ONLY_AT_ENDS)
+ || (syntax & RE_TIGHT_ALT))
+ ? p - 1 == pattern
+
+ /* If just after a newline, or... */
+ : ((p - 2 >= pattern && p[-2] == '\n')
+
+ /* ...no levels match anything, then in a leading position. */
+
+ || no_levels_match_anything (level_match_status)))
+ {
+ if (had_an_endline)
+ goto normal_char;
+
+ if (syntax & RE_REPEATED_ANCHORS_AWAY)
+ if (!record_anchor_position (!COMPILE_STACK_EMPTY,
+ b - bufp->buffer, &anchor_list))
+ goto memory_exhausted;
+
+ }
+
+ else if (syntax & RE_CONTEXT_INVALID_ANCHORS)
+ goto invalid_pattern;
+
+ /* If not just after a newline and not always supposed to be
+ an anchor, consider it an ordinary character. */
+
+ else if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
+ && ((syntax & RE_ANCHORS_ONLY_AT_ENDS)
+ /* To make, e.g., `^(^a)' match `^a'. */
+ ? p - 1 != pattern
+ : (int)laststart))
+ goto normal_char;
- case '+':
- case '?':
- if ((obscure_syntax & RE_BK_PLUS_QM)
- || (obscure_syntax & RE_LIMITED_OPS))
- goto normal_char;
- handle_plus:
- case '*':
- /* If there is no previous pattern, char not special. */
- if (!laststart)
+ if (syntax & RE_TIGHT_ALT)
+ {
+ if (p != pattern + 1 && !(syntax & RE_CONTEXT_INDEP_ANCHORS))
+ goto normal_char;
+
+ BUF_PUSH (begline);
+ begalt = b; /* Make alternative begin after the '^'. */
+ }
+ else
+ BUF_PUSH (begline);
+
+ break;
+
+ case '+':
+ case '?':
+ if ((syntax & RE_BK_PLUS_QM)
+ || (syntax & RE_LIMITED_OPS))
+ goto normal_char;
+ handle_plus:
+ case '*':
+ /* If there is no previous pattern... */
+ if (!laststart)
{
- if (obscure_syntax & RE_CONTEXTUAL_INVALID_OPS)
- goto invalid_pattern;
- else if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
- goto normal_char;
+ if (syntax & RE_CONTEXT_INVALID_OPS)
+ goto missing_preceding_re;
+ else if (!(syntax & RE_CONTEXT_INDEP_OPS))
+ goto normal_char;
}
- /* If there is a sequence of repetition chars,
- collapse it down to just one. */
- zero_times_ok = 0;
- many_times_ok = 0;
- while (1)
- {
- zero_times_ok |= c != '+';
- many_times_ok |= c != '?';
- if (p == pend)
- break;
- PATFETCH (c);
- if (c == '*')
- ;
- else if (!(obscure_syntax & RE_BK_PLUS_QM)
+
+ if ((syntax & RE_REPEATED_ANCHORS_AWAY)
+ && (enum regexpcode) *laststart == start_memory)
+ remove_intervening_anchors (laststart, b, anchor_list, bufp);
+
+ /* If there is a sequence of repetition chars, collapse it
+ down to just one. We can't combine interval operators with
+ these because we'd get incorrect behavior for, e.g., `a{2}*',
+ which should only match an even number of `a's. */
+
+ zero_times_ok = 0;
+ many_times_ok = 0;
+
+ while (1)
+ {
+ zero_times_ok |= c != '+';
+ many_times_ok |= c != '?';
+
+ if (p == pend)
+ break;
+
+ PATFETCH (c);
+
+ if (c == '*')
+ {
+ if (syntax & RE_NO_CONSECUTIVE_REPEATS)
+ goto invalid_preceding_re;
+ }
+ else if (!(syntax & RE_BK_PLUS_QM)
&& (c == '+' || c == '?'))
- ;
- else if ((obscure_syntax & RE_BK_PLUS_QM)
+ {
+ if (syntax & RE_NO_CONSECUTIVE_REPEATS)
+ goto invalid_preceding_re;
+ }
+ else if ((syntax & RE_BK_PLUS_QM)
&& c == '\\')
{
- int c1;
- PATFETCH (c1);
- if (!(c1 == '+' || c1 == '?'))
+ if (p == pend)
+ goto trailing_backslash;
+
+ PATFETCH (c1);
+
+ if (!(c1 == '+' || c1 == '?'))
{
PATUNFETCH;
PATUNFETCH;
break;
}
- c = c1;
+
+ if (syntax & RE_NO_CONSECUTIVE_REPEATS)
+ goto invalid_preceding_re;
+
+ c = c1;
}
else
{
PATUNFETCH;
break;
}
- }
+ }
/* Star, etc. applied to an empty pattern is equivalent
to an empty pattern. */
- if (!laststart)
+ if (!laststart)
break;
- /* Now we know whether or not zero matches is allowed
+ /* Now we know whether or not zero matches is allowed
and also whether or not two or more matches is allowed. */
- if (many_times_ok)
- {
- /* If more than one repetition is allowed, put in at the
- end a backward relative jump from b to before the next
- jump we're going to put in below (which jumps from
- laststart to after this jump). */
- GET_BUFFER_SPACE (3);
- store_jump (b, maybe_finalize_jump, laststart - 3);
- b += 3; /* Because store_jump put stuff here. */
+
+ if (!add_op (&op_list, b - bufp->buffer))
+ goto memory_exhausted;
+
+ /* If more than one repetition is allowed, put in at the
+ end a backward relative jump from b to before the next jump
+ we're going to put in below (which jumps from laststart to
+ after this jump). */
+
+ if (many_times_ok)
+ {
+ GET_BUFFER_SPACE (3);
+ store_jump (b, maybe_pop_jump, laststart - 3);
+ b += 3; /* Because store_jump puts stuff here. */
}
+ /* Otherwise, put in a no_op so verify_and_adjust_endlines can
+ detect that, e.g., a preceding `$' is not an anchor. */
+ else
+ BUF_PUSH (no_op);
+
+
/* On failure, jump from laststart to b + 3, which will be the
end of the buffer after this jump is inserted. */
+
+ if (syntax & RE_REPEATED_ANCHORS_AWAY)
+ adjust_pattern_offsets_list (3, laststart - bufp->buffer,
+ &anchor_list);
+
+ adjust_pattern_offsets_list (3, laststart - bufp->buffer, &op_list);
GET_BUFFER_SPACE (3);
- insert_jump (on_failure_jump, laststart, b + 3, b);
- pending_exact = 0;
+ insert_jump (on_failure_jump, laststart, b + 3, b);
+ pending_exact = 0;
b += 3;
+
if (!zero_times_ok)
{
/* At least one repetition is required, so insert a
- dummy-failure before the initial on-failure-jump
+ dummy_failure before the initial on_failure_jump
instruction of the loop. This effects a skip over that
instruction the first time we hit that loop. */
- GET_BUFFER_SPACE (6);
+
+ if (syntax & RE_REPEATED_ANCHORS_AWAY)
+ adjust_pattern_offsets_list (3, laststart - bufp->buffer,
+ &anchor_list);
+
+ adjust_pattern_offsets_list (3, laststart - bufp->buffer,
+ &op_list);
+ GET_BUFFER_SPACE (3);
insert_jump (dummy_failure_jump, laststart, laststart + 6, b);
b += 3;
- }
+ }
break;
case '.':
laststart = b;
- BUFPUSH (anychar);
- break;
+
+ if (!add_op (&op_list, b - bufp->buffer))
+ goto memory_exhausted;
+
+ BUF_PUSH (anychar);
+
+ if (!set_this_level (&level_match_status, current_level)
+ || !set_match_status_of_active_groups (group_active_status,
+ &group_match_status))
+ goto memory_exhausted;
+
+ break;
case '[':
- if (p == pend)
- goto invalid_pattern;
- while (b - bufp->buffer
- > bufp->allocated - 3 - (1 << BYTEWIDTH) / BYTEWIDTH)
- EXTEND_BUFFER;
+ {
+ unsigned just_had_a_char_class = 0;
+
+ if (p == pend)
+ goto unmatched_left_bracket;
- laststart = b;
- if (*p == '^')
- {
- BUFPUSH (charset_not);
- p++;
- }
- else
- BUFPUSH (charset);
- p1 = p;
+ while (b - bufp->buffer
+ > bufp->allocated - 3 - (1 << BYTEWIDTH) / BYTEWIDTH)
+ EXTEND_BUFFER;
- BUFPUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
- /* Clear the whole map */
- memset (b, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
-
- if ((obscure_syntax & RE_HAT_NOT_NEWLINE) && b[-2] == charset_not)
- SET_LIST_BIT ('\n');
+ laststart = b;
+ if (!add_op (&op_list, b - bufp->buffer))
+ goto memory_exhausted;
- /* Read in characters and ranges, setting map bits. */
- while (1)
- {
- /* Don't translate while fetching, in case it's a range bound.
- When we set the bit for the character, we translate it. */
- PATFETCH_RAW (c);
+ if (*p == '^')
+ {
+ BUF_PUSH (charset_not);
+ p++;
+ }
+ else
+ BUF_PUSH (charset);
- /* If set, \ escapes characters when inside [...]. */
- if ((obscure_syntax & RE_AWK_CLASS_HACK) && c == '\\')
- {
- PATFETCH(c1);
- SET_LIST_BIT (c1);
- continue;
- }
- if (c == ']')
- {
- if (p == p1 + 1)
- {
- /* If this is an empty bracket expression. */
- if ((obscure_syntax & RE_NO_EMPTY_BRACKETS)
- && p == pend)
- goto invalid_pattern;
- }
- else
- /* Stop if this isn't merely a ] inside a bracket
- expression, but rather the end of a bracket
- expression. */
- break;
- }
- /* Get a range. */
- if (p[0] == '-' && p[1] != ']')
- {
- PATFETCH (c1);
- /* Don't translate the range bounds while fetching them. */
- PATFETCH_RAW (c1);
+ /* Remember the first position in the bracket expression. */
+ p1 = p;
+
+ BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
+ /* Clear the whole map */
+ bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
+
+ if ((syntax & RE_HAT_LISTS_NOT_NEWLINE)
+ && (enum regexpcode) b[-2] == charset_not)
+ SET_LIST_BIT ('\n');
+
+
+ /* Read in characters and ranges, setting map bits. */
+ while (1)
+ {
+ if (p == pend)
+ goto unmatched_left_bracket;
+
+ PATFETCH (c);
+
+
+ /* If set, \ escapes characters when inside [...]. */
+ if ((syntax & RE_AWK_CLASS_HACK) && c == '\\')
+ {
+ if (p == pend)
+ goto trailing_backslash;
+
+ PATFETCH(c1);
+ SET_LIST_BIT (c1);
+ continue;
+ }
+ /* Could be the end of the bracket expression. If it's
+ not (i.e., when the bracket expression is `[]' so
+ far), the ']' character bit gets set way below. */
+
+ if (c == ']' && p != p1 + 1)
+ break;
+
+
+ /* Look ahead to see if it's a range when the last thing
+ was a character class. */
+
+ if (just_had_a_char_class && c == '-' && *p != ']')
+ goto invalid_range_end;
+
+ /* Look ahead to see if it's a range when the last thing
+ was a character: if this is a hyphen not at the
+ beginning or the end of a list, then it's the range
+ operator. */
+
+ if (c == '-'
+ && !(p - 2 >= pattern && p[-2] == '[')
+ && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
+ && *p != ']')
+ {
+ DO_RANGE;
+ }
- if ((obscure_syntax & RE_NO_EMPTY_RANGES) && c > c1)
- goto invalid_pattern;
+ else if (p[0] == '-' && p[1] != ']')
+ {
+ /* This handles ranges made up of characters only. */
+ PATFETCH (c1); /* The `-'. */
+ DO_RANGE;
+ }
+
+ /* See if we're at the beginning of a possible character
+ class. */
+
+ else if ((syntax & RE_CHAR_CLASSES)
+ && c == '[' && p[0] == ':')
+ {
+ /* Longest valid character class word has six chars. */
+ char str[CHAR_CLASS_MAX_LENGTH];
- if ((obscure_syntax & RE_NO_HYPHEN_RANGE_END)
- && c1 == '-' && *p != ']')
- goto invalid_pattern;
+ PATFETCH (c);
+ c1 = 0;
+
+ /* If pattern is `[[:'. */
+ if (p == pend)
+ goto unmatched_left_bracket;
+
+ while (1)
+ {
+ /* Don't translate the ``character class''
+ characters. */
+ PATFETCH_RAW (c);
+ if (c == ':' || c == ']' || p == pend
+ || c1 == CHAR_CLASS_MAX_LENGTH)
+ break;
+ str[c1++] = c;
+ }
+ str[c1] = '\0';
- while (c <= c1)
- {
- /* Translate each char that's in the range. */
- if (translate)
- SET_LIST_BIT (translate[c]);
- else
- SET_LIST_BIT (c);
- c++;
- }
- }
- else if ((obscure_syntax & RE_CHAR_CLASSES)
- && c == '[' && p[0] == ':')
- {
- /* Longest valid character class word has six characters. */
- char str[CHAR_CLASS_MAX_LENGTH];
- PATFETCH (c);
- c1 = 0;
- /* If no ] at end. */
- if (p == pend)
- goto invalid_pattern;
- while (1)
- {
- /* Don't translate the ``character class'' characters. */
- PATFETCH_RAW (c);
- if (c == ':' || c == ']' || p == pend
- || c1 == CHAR_CLASS_MAX_LENGTH)
- break;
- str[c1++] = c;
- }
- str[c1] = '\0';
- if (p == pend
- || c == ']' /* End of the bracket expression. */
- || p[0] != ']'
- || p + 1 == pend
- || (strcmp (str, "alpha") != 0
- && strcmp (str, "upper") != 0
- && strcmp (str, "lower") != 0
- && strcmp (str, "digit") != 0
- && strcmp (str, "alnum") != 0
- && strcmp (str, "xdigit") != 0
- && strcmp (str, "space") != 0
- && strcmp (str, "print") != 0
- && strcmp (str, "punct") != 0
- && strcmp (str, "graph") != 0
- && strcmp (str, "cntrl") != 0))
- {
- /* Undo the ending character, the letters, and leave
- the leading : and [ (but set bits for them). */
- c1++;
- while (c1--)
- PATUNFETCH;
- SET_LIST_BIT ('[');
- SET_LIST_BIT (':');
- }
- else
- {
- /* The ] at the end of the character class. */
- PATFETCH (c);
- if (c != ']')
- goto invalid_pattern;
- for (c = 0; c < (1 << BYTEWIDTH); c++)
- {
- if ((strcmp (str, "alpha") == 0 && isalpha (c))
- || (strcmp (str, "upper") == 0 && isupper (c))
- || (strcmp (str, "lower") == 0 && islower (c))
- || (strcmp (str, "digit") == 0 && isdigit (c))
- || (strcmp (str, "alnum") == 0 && isalnum (c))
- || (strcmp (str, "xdigit") == 0 && isxdigit (c))
- || (strcmp (str, "space") == 0 && isspace (c))
- || (strcmp (str, "print") == 0 && isprint (c))
- || (strcmp (str, "punct") == 0 && ispunct (c))
- || (strcmp (str, "graph") == 0 && isgraph (c))
- || (strcmp (str, "cntrl") == 0 && iscntrl (c)))
- SET_LIST_BIT (c);
- }
- }
- }
- else if (translate)
- SET_LIST_BIT (translate[c]);
- else
- SET_LIST_BIT (c);
- }
+ /* If it isn't a word bracketed by `[:' and `:]',
+ undo the ending character, the letters, and leave
+ the leading `:' and `[' (but set bits for them). */
+
+ if (c == ':' && p[0] == ']')
+ {
+ if (!IS_CHAR_CLASS (str))
+ goto invalid_char_class;
+
+ /* The ] at the end of the character class. */
+ PATFETCH (c);
+
+ if (p == pend)
+ goto unmatched_left_bracket;
+
+ for (c = 0; c < (1 << BYTEWIDTH); c++)
+ {
+ if ((strcmp (str, "alpha") == 0 && isalpha (c))
+ || (strcmp (str, "upper") == 0 && isupper (c))
+ || (strcmp (str, "lower") == 0 && islower (c))
+ || (strcmp (str, "digit") == 0 && isdigit (c))
+ || (strcmp (str, "alnum") == 0 && isalnum (c))
+ || (strcmp (str, "xdigit") == 0 && isxdigit (c))
+ || (strcmp (str, "space") == 0 && isspace (c))
+ || (strcmp (str, "print") == 0 && isprint (c))
+ || (strcmp (str, "punct") == 0 && ispunct (c))
+ || (strcmp (str, "graph") == 0 && isgraph (c))
+ || (strcmp (str, "cntrl") == 0 && iscntrl (c)))
+ SET_LIST_BIT (c);
+ }
+ just_had_a_char_class = 1;
+ }
+ else
+ {
+ c1++;
+ while (c1--)
+ PATUNFETCH;
+ SET_LIST_BIT ('[');
+ SET_LIST_BIT (':');
+ just_had_a_char_class = 0;
+ }
+ }
+ else
+ {
+ just_had_a_char_class = 0;
+ SET_LIST_BIT (c);
+ }
+ }
+
+ /* Discard any (non)matching list bytes that are all 0 at the
+ end of the map. Decrement the map-length byte too. */
+
+ while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
+ b[-1]--;
+ b += b[-1];
+ }
+
+ if (!set_this_level (&level_match_status, current_level)
+ || !set_match_status_of_active_groups (group_active_status,
+ &group_match_status))
+ goto memory_exhausted;
- /* Discard any character set/class bitmap bytes that are all
- 0 at the end of the map. Decrement the map-length byte too. */
- while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
- b[-1]--;
- b += b[-1];
break;
- case '(':
- if (! (obscure_syntax & RE_NO_BK_PARENS))
+
+ case '(':
+ if (!(syntax & RE_NO_BK_PARENS))
goto normal_char;
else
goto handle_open;
case ')':
- if (! (obscure_syntax & RE_NO_BK_PARENS))
+ if (! (syntax & RE_NO_BK_PARENS))
goto normal_char;
else
goto handle_close;
case '\n':
- if (! (obscure_syntax & RE_NEWLINE_OR))
+ if (! (syntax & RE_NEWLINE_ALT))
goto normal_char;
else
goto handle_bar;
case '|':
- if ((obscure_syntax & RE_CONTEXTUAL_INVALID_OPS)
- && (! laststart || p == pend))
- goto invalid_pattern;
- else if (! (obscure_syntax & RE_NO_BK_VBAR))
+ if (!(syntax & RE_NO_BK_VBAR))
goto normal_char;
else
goto handle_bar;
case '{':
- if (! ((obscure_syntax & RE_NO_BK_CURLY_BRACES)
- && (obscure_syntax & RE_INTERVALS)))
- goto normal_char;
- else
+ if ((syntax & RE_NO_BK_BRACES)
+ && (syntax & RE_INTERVALS))
goto handle_interval;
+ else
+ goto normal_char;
case '\\':
- if (p == pend) goto invalid_pattern;
- PATFETCH_RAW (c);
+ if (p == pend)
+ goto trailing_backslash;
+
+ PATFETCH_RAW (c);
switch (c)
{
- case '(':
- if (obscure_syntax & RE_NO_BK_PARENS)
+ case '(':
+ if (syntax & RE_NO_BK_PARENS)
goto normal_backsl;
- handle_open:
- if (stackp == stacke) goto nesting_too_deep;
+ handle_open:
+ bufp->re_nsub++;
+ increase_level (&current_level);
- /* Laststart should point to the start_memory that we are about
- to push (unless the pattern has RE_NREGS or more ('s). */
- *stackp++ = b - bufp->buffer;
- if (regnum < RE_NREGS)
- {
- BUFPUSH (start_memory);
- BUFPUSH (regnum);
- }
- *stackp++ = fixup_jump ? fixup_jump - bufp->buffer + 1 : 0;
- *stackp++ = regnum++;
- *stackp++ = begalt - bufp->buffer;
- fixup_jump = 0;
+ if (!make_group_active (&group_active_status, regnum))
+ goto memory_exhausted;
+
+ if (syntax & RE_NO_EMPTY_GROUPS)
+ {
+ p1 = p;
+ if (*p1 == '^') p1++;
+ if (*p1 == '$') p1++;
+ if (!(syntax & RE_NO_BK_PARENS) && *p1 == '\\') p1++;
+
+ /* If found an empty group... */
+ if (*p1 == ')')
+ goto invalid_pattern;
+ }
+
+ /* Value to restore in laststart when hit end of this
+ group; should point to the start_memory that we are
+ about to push. */
+
+ if (COMPILE_STACK_FULL)
+ {
+ compile_stack.stack = (compile_stack_element *)
+ realloc (compile_stack.stack,
+ (compile_stack.size << 1)
+ * sizeof (compile_stack_element));
+
+ if (compile_stack.stack == NULL)
+ goto memory_exhausted;
+
+ compile_stack.size <<= 1;
+ }
+
+ compile_stack.stack[compile_stack.avail].laststart_offset
+ = b - bufp->buffer;
+ compile_stack.stack[compile_stack.avail].fixup_alt_jump
+ = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
+ compile_stack.stack[compile_stack.avail].regnum = regnum;
+ compile_stack.stack[compile_stack.avail].begalt_offset
+ = begalt - bufp->buffer;
+ compile_stack.avail++;
+
+ if (!add_op (&op_list, b - bufp->buffer))
+ goto memory_exhausted;
+
+ BUF_PUSH_2 (start_memory, regnum);
+ regnum++;
+ fixup_alt_jump = 0;
laststart = 0;
begalt = b;
- break;
+ break;
case ')':
- if (obscure_syntax & RE_NO_BK_PARENS)
+ if (syntax & RE_NO_BK_PARENS)
goto normal_backsl;
- handle_close:
- if (stackp == stackb) goto unmatched_close;
- begalt = *--stackp + bufp->buffer;
- if (fixup_jump)
- store_jump (fixup_jump, jump, b);
- if (stackp[-1] < RE_NREGS)
- {
- BUFPUSH (stop_memory);
- BUFPUSH (stackp[-1]);
- }
- stackp -= 2;
- fixup_jump = *stackp ? *stackp + bufp->buffer - 1 : 0;
- laststart = *--stackp + bufp->buffer;
+
+ if (COMPILE_STACK_EMPTY)
+ if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
+ goto normal_backsl;
+ else
+ goto unmatched_close;
+
+ handle_close:
+ if (fixup_alt_jump)
+ store_jump (fixup_alt_jump, jump_past_next_alt, b);
+
+ /* See similar code for backslashed parens above. */
+
+ if (COMPILE_STACK_EMPTY)
+ if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
+ goto normal_char;
+ else
+ goto unmatched_close;
+
+ if (get_level_match_status (level_match_status, current_level))
+ if (!set_next_lower_level (&level_match_status, current_level))
+ goto memory_exhausted;
+
+ /* Only call these if know you have a matched close. */
+ decrease_level (&current_level);
+ make_group_inactive (&group_active_status, regnum);
+
+ compile_stack.avail--;
+ begalt
+ = compile_stack.stack[compile_stack.avail].begalt_offset
+ + bufp->buffer;
+ laststart
+ = (compile_stack.stack[compile_stack.avail].laststart_offset
+ + bufp->buffer);
+
+ fixup_alt_jump = compile_stack.stack[compile_stack.avail].fixup_alt_jump
+ ? compile_stack.stack[compile_stack.avail]
+ .fixup_alt_jump + bufp->buffer - 1
+ : 0;
+
+ if (!add_op (&op_list, b - bufp->buffer))
+ goto memory_exhausted;
+
+ BUF_PUSH_2 (stop_memory,
+ compile_stack.stack[compile_stack.avail].regnum);
break;
- case '|':
- if ((obscure_syntax & RE_LIMITED_OPS)
- || (obscure_syntax & RE_NO_BK_VBAR))
+ case '|': /* `\|'. */
+ if ((syntax & RE_LIMITED_OPS)
+ || (syntax & RE_NO_BK_VBAR))
goto normal_backsl;
handle_bar:
- if (obscure_syntax & RE_LIMITED_OPS)
+ if (syntax & RE_LIMITED_OPS)
goto normal_char;
- /* Insert before the previous alternative a jump which
+
+ /* Disallow empty alternatives if RE_NO_EMPTY_ALTS is set.
+ Caveat: can't detect if the vbar is followed by a
+ trailing '$' yet, unless it's the last thing in a
+ pattern; the routine for verifying endlines has to do
+ the rest. */
+
+ if ((syntax & RE_NO_EMPTY_ALTS)
+ && (!laststart || p == pend
+ || (*p == '$' && p + 1 == pend)
+ || ((syntax & RE_NO_BK_PARENS)
+ ? (p < pend && *p == ')')
+ : (p + 1 < pend && p[0] == '\\' && p[1] == ')'))))
+ goto invalid_pattern;
+
+
+ /* Clear some variables. */
+
+ if (lower_levels_match_nothing (level_match_status,
+ current_level))
+ clear_this_and_higher_levels (&level_match_status,
+ current_level);
+ had_an_endline = false;
+
+
+ /* Insert before the previous alternative a jump which
jumps to this alternative if the former fails. */
- GET_BUFFER_SPACE (6);
+
+ if (syntax & RE_REPEATED_ANCHORS_AWAY)
+ adjust_pattern_offsets_list (3, begalt - bufp->buffer,
+ &anchor_list);
+
+ adjust_pattern_offsets_list (3, begalt - bufp->buffer, &op_list);
+ GET_BUFFER_SPACE (3);
insert_jump (on_failure_jump, begalt, b + 6, b);
pending_exact = 0;
b += 3;
- /* The alternative before the previous alternative has a
- jump after it which gets executed if it gets matched.
- Adjust that jump so it will jump to the previous
- alternative's analogous jump (put in below, which in
- turn will jump to the next (if any) alternative's such
- jump, etc.). The last such jump jumps to the correct
- final destination. */
- if (fixup_jump)
- store_jump (fixup_jump, jump, b);
+
+ /* The alternative before this one has a jump after it
+ which gets executed if it gets matched. Adjust that
+ jump so it will jump to this alternative's analogous
+ jump (put in below, which in turn will jump to the next
+ (if any) alternative's such jump, etc.). The last such
+ jump jumps to the correct final destination. A picture:
+ _____ _____
+ | | | |
+ | v | v
+ a | b | c
+
+ If we are at `b,' then fixup_alt_jump right now points to a
+ three-byte space after `a.' We'll put in the jump, set
+ fixup_alt_jump to right after `b,' and leave behind three
+ bytes which we'll fill in when we get to after `c.' */
+
+ if (fixup_alt_jump)
+ store_jump (fixup_alt_jump, jump_past_next_alt, b);
- /* Leave space for a jump after previous alternative---to be
- filled in later. */
- fixup_jump = b;
+ /* Mark and leave space for a jump after this alternative
+ ---to be filled in later either by next alternative or
+ when know we're at the end of a series of alternatives. */
+
+ if (!add_op (&op_list, b - bufp->buffer))
+ goto memory_exhausted;
+
+ fixup_alt_jump = b;
+ GET_BUFFER_SPACE (3);
b += 3;
laststart = 0;
@@ -926,93 +1576,167 @@ re_compile_pattern (pattern, size, bufp)
break;
case '{':
- if (! (obscure_syntax & RE_INTERVALS)
- /* Let \{ be a literal. */
- || ((obscure_syntax & RE_INTERVALS)
- && (obscure_syntax & RE_NO_BK_CURLY_BRACES))
- /* If it's the string "\{". */
- || (p - 2 == pattern && p == pend))
+ /* If \{ is a literal. */
+ if (!(syntax & RE_INTERVALS)
+ /* If we're at a "\{" and it's not the open-interval
+ operator. */
+ || ((syntax & RE_INTERVALS)
+ && (syntax & RE_NO_BK_BRACES))
+ || (p - 2 == pattern && p == pend))
goto normal_backsl;
+
handle_interval:
- beg_interval = p - 1; /* The {. */
- /* If there is no previous pattern, this isn't an interval. */
- if (!laststart)
- {
- if (obscure_syntax & RE_CONTEXTUAL_INVALID_OPS)
- goto invalid_pattern;
- else
- goto normal_backsl;
- }
- /* It also isn't an interval if not preceded by an re
- matching a single character or subexpression, or if
- the current type of intervals can't handle back
- references and the previous thing is a back reference. */
- if (! (*laststart == anychar
- || *laststart == charset
- || *laststart == charset_not
- || *laststart == start_memory
- || (*laststart == exactn && laststart[1] == 1)
- || (! (obscure_syntax & RE_NO_BK_REFS)
- && *laststart == duplicate)))
- {
- if (obscure_syntax & RE_NO_BK_CURLY_BRACES)
- goto normal_char;
-
- /* Posix extended syntax is handled in previous
- statement; this is for Posix basic syntax. */
- if (obscure_syntax & RE_INTERVALS)
- goto invalid_pattern;
-
- goto normal_backsl;
- }
+ /* If got here, then intervals must be allowed. */
+
+ beg_interval = p - 1; /* The `{'. */
+ following_left_brace = 0;
lower_bound = -1; /* So can see if are set. */
upper_bound = -1;
+
+ if (p == pend)
+ {
+ if (syntax & RE_NO_BK_BRACES)
+ goto unfetch_interval;
+ else
+ goto unmatched_left_curly_brace;
+ }
+
GET_UNSIGNED_NUMBER (lower_bound);
- if (c == ',')
+
+ if (c == ',')
{
- GET_UNSIGNED_NUMBER (upper_bound);
+ GET_UNSIGNED_NUMBER (upper_bound);
if (upper_bound < 0)
upper_bound = RE_DUP_MAX;
}
+
if (upper_bound < 0)
upper_bound = lower_bound;
- if (! (obscure_syntax & RE_NO_BK_CURLY_BRACES))
+
+ if (lower_bound < 0 || upper_bound > RE_DUP_MAX
+ || lower_bound > upper_bound)
+ {
+ if (syntax & RE_NO_BK_BRACES)
+ goto unfetch_interval;
+ else
+ goto invalid_braces_content;
+ }
+
+ if (!(syntax & RE_NO_BK_BRACES))
{
if (c != '\\')
- goto invalid_pattern;
+ goto unmatched_left_curly_brace;
+
PATFETCH (c);
}
- if (c != '}' || lower_bound < 0 || upper_bound > RE_DUP_MAX
- || lower_bound > upper_bound
- || ((obscure_syntax & RE_NO_BK_CURLY_BRACES)
- && p != pend && *p == '{'))
- {
- if (obscure_syntax & RE_NO_BK_CURLY_BRACES)
+
+ if (c != '}')
+ {
+ if (syntax & RE_NO_BK_BRACES)
goto unfetch_interval;
- else
- goto invalid_pattern;
- }
+ else
+ goto invalid_braces_content;
+ }
+
- /* If upper_bound is zero, don't want to succeed at all;
+ /* Parsed a valid interval, but if an interval can't
+ operate on another repetition operator, check that what
+ follows isn't one. */
+
+ if ((syntax & RE_NO_CONSECUTIVE_REPEATS) && p != pend)
+ {
+ if (*p == '*' || *p == '+' || *p == '?')
+ goto invalid_preceding_re;
+
+ if (syntax & RE_NO_BK_BRACES)
+ {
+ if (*p == '{')
+ {
+ /* Close but not exactly as above. */
+
+ int lower_bound = -1;
+ int upper_bound = -1;
+
+ following_left_brace = p++;
+ GET_UNSIGNED_NUMBER (lower_bound);
+
+ if (c == ',')
+ {
+ GET_UNSIGNED_NUMBER (upper_bound);
+ if (upper_bound < 0)
+ upper_bound = RE_DUP_MAX;
+ }
+
+ if (upper_bound < 0)
+ upper_bound = lower_bound;
+
+ /* If not a valid interval, then we don't have
+ an interval operating on another one; what
+ we have instead is a series match-self ops
+ starting with a '{'. */
+
+ if (lower_bound < 0 || upper_bound > RE_DUP_MAX
+ || lower_bound > upper_bound || c != '}')
+ {
+ /* Back up to '{' so can use again
+ put it in C, as the normal_char label
+ code expects that; will go to that
+ label after putting the preceding valid
+ interval in the buffer. */
+
+ p = following_left_brace;
+ PATFETCH (c);
+ }
+ else
+ goto invalid_preceding_re;
+ }
+ }
+ else if (p[0] == '\\' && p[1] == '{')
+ goto invalid_preceding_re;
+ }
+
+
+ /* We just parsed a valid interval. */
+
+ /* If it's invalid to have no preceding re. */
+ if (!laststart)
+ {
+ if (syntax & RE_CONTEXT_INVALID_OPS)
+ goto missing_preceding_re;
+ else if (syntax & RE_CONTEXT_INDEP_OPS)
+ laststart = b;
+ else
+ goto unfetch_interval;
+ }
+ else if ((syntax & RE_REPEATED_ANCHORS_AWAY)
+ && (enum regexpcode) *laststart == start_memory)
+ remove_intervening_anchors (laststart, b, anchor_list, bufp);
+
+ /* If upper_bound is zero, don't want to succeed at all;
jump from laststart to b + 3, which will be the end of
the buffer after this jump is inserted. */
if (upper_bound == 0)
{
+ if (syntax & RE_REPEATED_ANCHORS_AWAY)
+ adjust_pattern_offsets_list (3, laststart - bufp->buffer,
+ &anchor_list);
+
+ adjust_pattern_offsets_list (3, laststart - bufp->buffer,
+ &op_list);
GET_BUFFER_SPACE (3);
- insert_jump (jump, laststart, b + 3, b);
+ insert_jump (no_pop_jump, laststart, b + 3, b);
b += 3;
}
/* Otherwise, after lower_bound number of succeeds, jump
- to after the jump_n which will be inserted at the end
- of the buffer, and insert that jump_n. */
+ to after the no_pop_jump_n which will be inserted at the end
+ of the buffer, and insert that no_pop_jump_n. */
else
{ /* Set to 5 if only one repetition is allowed and
- hence no jump_n is inserted at the current end of
- the buffer; then only space for the succeed_n is
- needed. Otherwise, need space for both the
- succeed_n and the jump_n. */
+ hence no no_pop_jump_n is inserted at the current
+ end of the buffer. Otherwise, need 10 bytes total
+ for the succeed_n and the no_pop_jump_n. */
unsigned slots_needed = upper_bound == 1 ? 5 : 10;
@@ -1021,37 +1745,69 @@ re_compile_pattern (pattern, size, bufp)
be set by its attendant set_number_at, because
re_compile_fastmap will need to know it. Jump to
what the end of buffer will be after inserting
- this succeed_n and possibly appending a jump_n. */
+ this succeed_n and possibly appending a
+ no_pop_jump_n. */
+
+ if (syntax & RE_REPEATED_ANCHORS_AWAY)
+ adjust_pattern_offsets_list (5, laststart - bufp->buffer,
+ &anchor_list);
+
+ adjust_pattern_offsets_list (5, laststart - bufp->buffer,
+ &op_list);
insert_jump_n (succeed_n, laststart, b + slots_needed,
b, lower_bound);
b += 5; /* Just increment for the succeed_n here. */
- /* More than one repetition is allowed, so put in at
+
+ /* More than one repetition is allowed, so put in at
the end of the buffer a backward jump from b to the
succeed_n we put in above. By the time we've gotten
to this jump when matching, we'll have matched once
already, so jump back only upper_bound - 1 times. */
+ if (!add_op (&op_list, b - bufp->buffer))
+ goto memory_exhausted;
+
if (upper_bound > 1)
{
- store_jump_n (b, jump_n, laststart, upper_bound - 1);
+ store_jump_n (b, no_pop_jump_n, laststart,
+ upper_bound - 1);
b += 5;
/* When hit this when matching, reset the
- preceding jump_n's n to upper_bound - 1. */
- BUFPUSH (set_number_at);
- GET_BUFFER_SPACE (2);
+ preceding no_pop_jump_n's n to upper_bound - 1. */
+
+ BUF_PUSH (set_number_at);
+
+ /* Only need to get space for the numbers. */
+ GET_BUFFER_SPACE (4);
STORE_NUMBER_AND_INCR (b, -5);
STORE_NUMBER_AND_INCR (b, upper_bound - 1);
}
- /* When hit this when matching, set the succeed_n's n. */
+ /* Otherwise, put in a no_op, so verify_and_adjust_endlines
+ can detect, e.g., a preceding `$' is not an anchor. */
+ else
+ BUF_PUSH (no_op);
+
+
+ /* When hit this when matching, set the succeed_n's n. */
+
+ if (syntax & RE_REPEATED_ANCHORS_AWAY)
+ adjust_pattern_offsets_list (5, laststart - bufp->buffer,
+ &anchor_list);
+
+ adjust_pattern_offsets_list (5, laststart - bufp->buffer,
+ &op_list);
GET_BUFFER_SPACE (5);
insert_op_2 (set_number_at, laststart, b, 5, lower_bound);
b += 5;
}
pending_exact = 0;
beg_interval = 0;
- break;
+
+ if (following_left_brace)
+ goto normal_char;
+ break;
unfetch_interval:
/* If an invalid interval, match the characters as literals. */
@@ -1063,64 +1819,88 @@ re_compile_pattern (pattern, size, bufp)
"regex: no interval beginning to which to backtrack.\n");
exit (1);
}
-
beg_interval = 0;
- PATFETCH (c); /* normal_char expects char in `c'. */
- goto normal_char;
- break;
+
+ /* normal_char and normal_backsl expect a character in `c'. */
+ PATFETCH (c);
+
+ if (!(syntax & RE_NO_BK_BRACES))
+ {
+ if (p > pattern && p[-1] == '\\')
+ goto normal_backsl;
+ }
+ goto normal_char;
#ifdef emacs
case '=':
- BUFPUSH (at_dot);
+ BUF_PUSH (at_dot);
break;
case 's':
laststart = b;
- BUFPUSH (syntaxspec);
PATFETCH (c);
- BUFPUSH (syntax_spec_code[c]);
+ BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
break;
case 'S':
laststart = b;
- BUFPUSH (notsyntaxspec);
PATFETCH (c);
- BUFPUSH (syntax_spec_code[c]);
+ BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
break;
#endif /* emacs */
case 'w':
laststart = b;
- BUFPUSH (wordchar);
- break;
+
+ if (!add_op (&op_list, b - bufp->buffer))
+ goto memory_exhausted;
+
+ BUF_PUSH (wordchar);
+
+ if (!set_this_level (&level_match_status, current_level)
+ || !set_match_status_of_active_groups (group_active_status,
+ &group_match_status))
+ goto memory_exhausted;
+
+ break;
case 'W':
laststart = b;
- BUFPUSH (notwordchar);
- break;
+
+ if (!add_op (&op_list, b - bufp->buffer))
+ goto memory_exhausted;
+
+ BUF_PUSH (notwordchar);
+
+ if (!set_this_level (&level_match_status, current_level)
+ || !set_match_status_of_active_groups (group_active_status,
+ &group_match_status))
+ goto memory_exhausted;
+
+ break;
case '<':
- BUFPUSH (wordbeg);
+ BUF_PUSH (wordbeg);
break;
case '>':
- BUFPUSH (wordend);
+ BUF_PUSH (wordend);
break;
case 'b':
- BUFPUSH (wordbound);
+ BUF_PUSH (wordbound);
break;
case 'B':
- BUFPUSH (notwordbound);
+ BUF_PUSH (notwordbound);
break;
case '`':
- BUFPUSH (begbuf);
+ BUF_PUSH (begbuf);
break;
case '\'':
- BUFPUSH (endbuf);
+ BUF_PUSH (endbuf);
break;
case '1':
@@ -1132,28 +1912,39 @@ re_compile_pattern (pattern, size, bufp)
case '7':
case '8':
case '9':
- if (obscure_syntax & RE_NO_BK_REFS)
+ if (syntax & RE_NO_BK_REFS)
goto normal_char;
+
c1 = c - '0';
- if (c1 >= regnum)
+
+ if (c1 >= regnum)
{
- if (obscure_syntax & RE_NO_EMPTY_BK_REF)
- goto invalid_pattern;
+ if (syntax & RE_NO_MISSING_BK_REF)
+ goto invalid_back_reference;
else
goto normal_char;
}
+
/* Can't back reference to a subexpression if inside of it. */
- for (stackt = stackp - 2; stackt > stackb; stackt -= 4)
- if (*stackt == c1)
- goto normal_char;
- laststart = b;
- BUFPUSH (duplicate);
- BUFPUSH (c1);
- break;
+ if (is_in_compile_stack (compile_stack, c1))
+ goto normal_char;
+
+ laststart = b;
+
+ if (!add_op (&op_list, b - bufp->buffer))
+ goto memory_exhausted;
+
+ BUF_PUSH_2 (duplicate, c1);
+
+ if (get_group_match_status (group_match_status, c1))
+ if (!set_this_level (&level_match_status, current_level))
+ goto memory_exhausted;
+
+ break;
case '+':
case '?':
- if (obscure_syntax & RE_BK_PLUS_QM)
+ if (syntax & RE_BK_PLUS_QM)
goto handle_plus;
else
goto normal_backsl;
@@ -1164,61 +1955,141 @@ re_compile_pattern (pattern, size, bufp)
/* You might think it would be useful for \ to mean
not to translate; but if we don't translate it
it will never match anything. */
- if (translate) c = translate[c];
- goto normal_char;
+
+ if (translate)
+ c = translate[c];
+
+ goto normal_char;
}
break;
- default:
- normal_char: /* Expects the character in `c'. */
- if (!pending_exact || pending_exact + *pending_exact + 1 != b
- || *pending_exact == 0177 || *p == '*' || *p == '^'
- || ((obscure_syntax & RE_BK_PLUS_QM)
+ default:
+
+ /* Expects the character in `c'! */
+ normal_char:
+ /* If no exactn currently being built. */
+ if (!pending_exact
+
+ /* If last exactn not at current position. */
+ || pending_exact + *pending_exact + 1 != b
+
+ || *pending_exact == 0177
+
+ /* If followed by a repetition operator. */
+ || *p == '*' || *p == '^'
+ || ((syntax & RE_BK_PLUS_QM)
? *p == '\\' && (p[1] == '+' || p[1] == '?')
: (*p == '+' || *p == '?'))
- || ((obscure_syntax & RE_INTERVALS)
- && ((obscure_syntax & RE_NO_BK_CURLY_BRACES)
+ || ((syntax & RE_INTERVALS)
+ && ((syntax & RE_NO_BK_BRACES)
? *p == '{'
: (p[0] == '\\' && p[1] == '{'))))
{
- laststart = b;
- BUFPUSH (exactn);
- pending_exact = b;
- BUFPUSH (0);
- }
- BUFPUSH (c);
- (*pending_exact)++;
- }
- }
+ /* Start building a new exactn. */
+
+ laststart = b;
- if (fixup_jump)
- store_jump (fixup_jump, jump, b);
+ if (!add_op (&op_list, b - bufp->buffer))
+ goto memory_exhausted;
- if (stackp != stackb) goto unmatched_open;
+ BUF_PUSH_2 (exactn, 0);
+ pending_exact = b - 1;
+
+ if (!set_this_level (&level_match_status, current_level))
+ goto memory_exhausted;
+ }
+ BUF_PUSH (c);
+ (*pending_exact)++;
+ break;
+
+ } /* end switch (c). */
+ } /* end while p!= pend. */
+
+ /* Through the pattern now. */
+
+ if (fixup_alt_jump)
+ store_jump (fixup_alt_jump, jump_past_next_alt, b);
+
+ if (!COMPILE_STACK_EMPTY)
+ goto unmatched_open;
+
+ /* Have to set this before calling the next routine. */
bufp->used = b - bufp->buffer;
+
+ if (!verify_and_adjust_endlines (op_list, group_match_status, bufp,
+ &enough_memory))
+ goto invalid_pattern;
+
+ if (!enough_memory)
+ goto memory_exhausted;
+
+
+ /* Normal return. */
return 0;
+
+ /* Abnormal return. */
+
invalid_pattern:
- return "Invalid regular expression";
+ bufp->used = b - bufp->buffer;
+ return "Invalid regular expression";
unmatched_open:
- return "Unmatched \\(";
+ bufp->used = b - bufp->buffer;
+ return "Unmatched ( or \\(";
unmatched_close:
- return "Unmatched \\)";
+ bufp->used = b - bufp->buffer;
+ return "Unmatched ) or \\)";
end_of_pattern:
- return "Premature end of regular expression";
-
- nesting_too_deep:
- return "Nesting too deep";
+ bufp->used = b - bufp->buffer;
+ return "Premature end of regular expression";
too_big:
- return "Regular expression too big";
+ bufp->used = b - bufp->buffer;
+ return "Regular expression too big";
memory_exhausted:
- return "Memory exhausted";
+ bufp->used = b - bufp->buffer;
+ return "Memory exhausted";
+
+ invalid_char_class:
+ bufp->used = b - bufp->buffer;
+ return "Invalid character class name";
+
+ unmatched_left_bracket:
+ bufp->used = b - bufp->buffer;
+ return "Unmatched [ or [^";
+
+ invalid_range_end:
+ bufp->used = b - bufp->buffer;
+ return "Invalid range end";
+
+ trailing_backslash:
+ bufp->used = b - bufp->buffer;
+ return "Trailing backslash";
+
+ unmatched_left_curly_brace:
+ bufp->used = b - bufp->buffer;
+ return "Unmatched \\{";
+
+ invalid_braces_content:
+ bufp->used = b - bufp->buffer;
+ return "Invalid content of \\{\\}";
+
+ missing_preceding_re:
+ bufp->used = b - bufp->buffer;
+ return "Missing preceding regular expression";
+
+ invalid_preceding_re:
+ bufp->used = b - bufp->buffer;
+ return "Invalid preceding regular expression";
+
+ invalid_back_reference:
+ bufp->used = b - bufp->buffer;
+ return "Invalid back reference";
}
@@ -1229,11 +2100,7 @@ re_compile_pattern (pattern, size, bufp)
static void
store_jump (from, opcode, to)
char *from, *to;
-#ifndef MSDOS
char opcode;
-#else
- int opcode;
-#endif /* MSDOS */
{
from[0] = opcode;
STORE_NUMBER(from + 1, to - (from + 3));
@@ -1248,11 +2115,7 @@ store_jump (from, opcode, to)
static void
insert_jump (op, from, to, current_end)
-#ifndef MSDOS
char op;
-#else
- int op;
-#endif /* MSDOS */
char *from, *to, *current_end;
{
register char *pfrom = current_end; /* Copy from here... */
@@ -1275,11 +2138,7 @@ insert_jump (op, from, to, current_end)
static void
store_jump_n (from, opcode, to, n)
char *from, *to;
-#ifndef MSDOS
char opcode;
-#else
- int opcode;
-#endif /* MSDOS */
unsigned n;
{
from[0] = opcode;
@@ -1298,11 +2157,7 @@ store_jump_n (from, opcode, to, n)
static void
insert_jump_n (op, from, to, current_end, n)
-#ifndef MSDOS
char op;
-#else
- int op;
-#endif /* MSDOS */
char *from, *to, *current_end;
unsigned n;
{
@@ -1323,11 +2178,7 @@ insert_jump_n (op, from, to, current_end, n)
static void
insert_op_2 (op, there, current_end, num_1, num_2)
-#ifndef MSDOS
char op;
-#else
- int op;
-#endif /* MSDOS */
char *there, *current_end;
int num_1, num_2;
{
@@ -1343,52 +2194,849 @@ insert_op_2 (op, there, current_end, num_1, num_2)
}
+/* Compile stack routine for regex_compile. */
+
+/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
+ false if it's not.
+
+ COMPILE_STACK is received by value and is only read here.  The
+ elements are examined from index `avail - 1' down to index 0, and
+ the `regnum' field of each one is compared against REGNUM.
+
+ COMPILE_STACK_EMPTY is a macro defined outside this function; it
+ takes no argument, so presumably it inspects the parameter named
+ `compile_stack' directly -- TODO confirm against its definition. */
+
+static boolean
+is_in_compile_stack (compile_stack, regnum)
+ compile_stack_type compile_stack;
+ int regnum;
+{
+ int this_element; /* Signed: the loop below ends by going under zero. */
+
+ if (COMPILE_STACK_EMPTY)
+ return false;
+
+ for (this_element = compile_stack.avail - 1;
+ this_element >= 0;
+ this_element--)
+ if (compile_stack.stack[this_element].regnum == regnum)
+ return true;
+
+ return false;
+}
+
+
+/* Pattern offsets list stuff. */
+
+/* Initializes a pattern offsets list PATTERN_OFFSETS_LIST_PTR to be
+ INIT_SIZE large.
+
+ Returns 1 if it can allocate the space and 0 if it can't.
+
+ On success the list's `size' field is INIT_SIZE and its `avail'
+ field is 0.  When malloc fails, `size' and `avail' are left as they
+ were and `offsets' is NULL.  A negative INIT_SIZE prints a message
+ with printf and then calls exit (1).
+
+ NOTE(review): the message text says "negative or zero", but the test
+ is `init_size < 0', so an INIT_SIZE of zero is accepted and malloc is
+ asked for zero bytes.  double_pattern_offsets_list grows a list with
+ `size << 1', which stays zero for such a list.  Confirm whether the
+ test was meant to be `init_size <= 0'. */
+
+static unsigned
+init_pattern_offsets_list (pattern_offsets_list_ptr, init_size)
+ pattern_offsets_list_type *pattern_offsets_list_ptr;
+ int init_size;
+{
+ if (init_size < 0)
+ {
+ printf ("Can't initialize a pattern offsets list with a negative \
+or zero init_size %d.\n", init_size);
+ exit (1);
+ }
+ else
+ {
+ pattern_offsets_list_ptr->offsets
+ = (pattern_offset_type *) malloc (init_size
+ * sizeof (pattern_offset_type));
+
+ if (pattern_offsets_list_ptr->offsets == NULL)
+ return 0;
+
+ pattern_offsets_list_ptr->size = init_size;
+ pattern_offsets_list_ptr->avail = 0; /* Nothing stored yet. */
+ }
+ return 1;
+}
+
+
+/* Doubles the size of a pattern offsets list PATTERN_OFFSETS_LIST_PTR.
+
+ Returns 1 if it can allocate the space and 0 if it can't.
+
+ The `size' field is doubled only after realloc has succeeded; the
+ `avail' field is not touched.
+
+ NOTE(review): realloc's result is assigned straight to `offsets'.  If
+ realloc returns NULL, `offsets' becomes NULL while `size' keeps its
+ old value, and the previous block can no longer be reached through
+ this list.  Callers shown in this file treat a 0 return as memory
+ exhaustion; confirm none of them reuse the list afterwards. */
+
+static unsigned
+double_pattern_offsets_list (pattern_offsets_list_ptr)
+ pattern_offsets_list_type *pattern_offsets_list_ptr;
+{
+ pattern_offsets_list_ptr->offsets
+ = (pattern_offset_type *) realloc (pattern_offsets_list_ptr->offsets,
+ (pattern_offsets_list_ptr->size << 1) * sizeof (pattern_offset_type));
+
+ if (pattern_offsets_list_ptr->offsets == NULL)
+ return 0;
+
+ pattern_offsets_list_ptr->size <<= 1;
+ return 1;
+}
+
+
+/* Adds OFFSET to PATTERN_OFFSETS_LIST_PTR.
+
+ Returns 1 if it can add the offset and 0 if it needs to allocate
+ space for it and can't.
+
+ OFFSET is stored at index `avail', and `avail' is then incremented,
+ so entries are kept in the order in which they were added.  When
+ PATTERN_OFFSETS_LIST_PTR_FULL (a macro defined outside this function)
+ reports the list full, double_pattern_offsets_list is called first;
+ if that fails, nothing is stored and `avail' is unchanged. */
+
+static unsigned
+add_pattern_offset (pattern_offsets_list_ptr, offset)
+ pattern_offsets_list_type *pattern_offsets_list_ptr;
+ pattern_offset_type offset;
+{
+ if (PATTERN_OFFSETS_LIST_PTR_FULL (pattern_offsets_list_ptr))
+ if (!double_pattern_offsets_list (pattern_offsets_list_ptr))
+ return 0;
+
+ pattern_offsets_list_ptr->offsets[pattern_offsets_list_ptr->avail] = offset;
+ pattern_offsets_list_ptr->avail++;
+
+ return 1;
+}
+
+
+/* Add INCREMENT to the offsets in PATTERN_OFFSETS_LIST_PTR that lie at
+ or after START_POSITION.
+
+ The leading run of entries smaller than START_POSITION is skipped.
+ Every entry from the first one that is not smaller, through the last
+ used entry (index `avail - 1'), then has INCREMENT added to it; those
+ later entries are not compared with START_POSITION again.
+
+ This gives "adjust everything at or after START_POSITION" only when
+ the entries are in ascending order -- presumably they are, because
+ offsets are appended as the compiled pattern grows; TODO confirm.
+
+ regex_compile calls this just before it inserts INCREMENT bytes into
+ the pattern buffer at START_POSITION. */
+
+static void
+adjust_pattern_offsets_list (increment, start_position,
+ pattern_offsets_list_ptr)
+ unsigned increment;
+ unsigned start_position;
+ pattern_offsets_list_type *pattern_offsets_list_ptr;
+{
+ unsigned this_pattern_offset = 0;
+
+ while (this_pattern_offset < pattern_offsets_list_ptr->avail
+ && pattern_offsets_list_ptr->offsets[this_pattern_offset]
+ < start_position)
+ this_pattern_offset++;
+
+ for (; this_pattern_offset < pattern_offsets_list_ptr->avail;
+ this_pattern_offset++)
+ pattern_offsets_list_ptr->offsets[this_pattern_offset] += increment;
+}
+
+
+/* Anchor routines for regex_compile. */
+
+/* If it's in a group, record in ANCHOR_LIST_PTR an anchor offset that's
+ at OFFSET.
+
+ Returns 1 if can put the offset in ANCHOR_LIST_PTR.
+ Returns 0 if runs out of memory allocating space for it.
+
+ When IN_A_GROUP is zero nothing is recorded and the result is 1.
+ Otherwise OFFSET is appended with add_pattern_offset and that
+ routine's failure is passed on as 0. */
+
+static unsigned
+record_anchor_position (in_a_group, offset, anchor_list_ptr)
+ unsigned in_a_group;
+ pattern_offset_type offset;
+ anchor_list_type *anchor_list_ptr;
+{
+ if (in_a_group)
+ if (!add_pattern_offset (anchor_list_ptr, offset))
+ return 0;
+
+ return 1;
+}
+
+
+/* Rewrite the anchor opcodes recorded in ANCHOR_LIST whose offsets lie
+ between START and END (both inclusive) in BUFP's buffer:
+
+ `begline' becomes `no_op',
+ `endline' becomes `endline_in_repeat',
+ `endline_before_newline' becomes `repeated_endline_before_newline'.
+
+ A byte holding any other opcode is stored back unchanged.  The bytes
+ are modified in place in BUFP->buffer; ANCHOR_LIST itself is passed
+ by value and is only read.
+
+ NOTE(review): the scan starts at entry 0 and the range test is part
+ of the loop condition, so the loop ends at the first entry that is
+ outside [START, END].  Entries inside the range that come after an
+ entry outside it are therefore never visited.  Confirm that this is
+ intended, e.g. that earlier anchors can never precede START here. */
+
+static void
+remove_intervening_anchors (start, end, anchor_list, bufp)
+ char *start, *end;
+ anchor_list_type anchor_list;
+ struct re_pattern_buffer *bufp;
+{
+ unsigned this_anchor = 0;
+
+ while (this_anchor < anchor_list.avail
+ && start - bufp->buffer <= anchor_list.offsets[this_anchor]
+ && anchor_list.offsets[this_anchor] <= end - bufp->buffer)
+ {
+ char *this_anchor_ptr
+ = bufp->buffer + anchor_list.offsets[this_anchor++];
+
+ *this_anchor_ptr = *this_anchor_ptr == endline
+ ? (char)endline_in_repeat
+ : *this_anchor_ptr == endline_before_newline
+ ? (char)repeated_endline_before_newline
+ : *this_anchor_ptr == begline
+ ? (char)no_op
+ : *this_anchor_ptr;
+ }
+}
+
+
+/* Op list stuff. */
+
+/* Add OP_OFFSET to OP_LIST_PTR.
+ Return 1 if can add it and 0 if can't allocate the space to do so.
+
+ This only forwards to add_pattern_offset; OP_LIST_PTR is handed to
+ that routine as is. */
+
+static unsigned
+add_op (op_list_ptr, op_offset)
+ op_list_type *op_list_ptr;
+ pattern_offset_type op_offset;
+{
+ return add_pattern_offset (op_list_ptr, op_offset);
+}
+
+
+/* Verify that all `$'s in an entire pattern buffer BUFP are valid
+   anchors or ordinary characters.  Either leave or change intermediate
+   forms of `$' anchor ops into `endline' or `exactn ...' where
+   appropriate.
+
+   OP_LIST holds the offsets of the operators in BUFP's compiled
+   pattern; it is walked from its last entry to its first, i.e. the
+   pattern is examined backwards.  GROUP_FORWARD_MATCH_STATUS says which
+   groups matched something and is consulted for `duplicate' ops.
+
+   *ENOUGH_MEMORY is set to true if we don't run out of space
+   allocating internal data structures and to false if we do; in the
+   latter case the walk is abandoned and true is returned.
+
+   Return true if the pattern is valid and false if it isn't.
+
+   The three scratch bits lists are obtained with init_bits_list and
+   are released again on every return path.  */
+
+static boolean
+verify_and_adjust_endlines (op_list, group_forward_match_status,
+                            bufp, enough_memory)
+     op_list_type op_list;
+     /* `duplicate' case needs this: which groups matched something;
+        set when went forwards through the pattern.  */
+     bits_list_type group_forward_match_status;
+     struct re_pattern_buffer *bufp;
+     boolean *enough_memory;
+{
+  int this_op_offset;   /* Has to be type int because decrementing it.  */
+  /* See comments for analogous variables used for '^' in regex_compile.  */
+
+  bits_list_type level_match_status;
+  unsigned current_level = 0;
+  bits_list_type group_match_status;
+  bits_list_type group_active_status;
+  char *bend = bufp->buffer + bufp->used;
+  char *previous_p = NULL;
+  boolean pattern_is_valid = true;
+
+  /* So that the cleanup code can free all three lists even if only
+     some of them got allocated.  */
+  level_match_status.bits = NULL;
+  group_match_status.bits = NULL;
+  group_active_status.bits = NULL;
+
+  if (!(init_bits_list (&level_match_status)
+        && init_bits_list (&group_match_status)
+        && init_bits_list (&group_active_status)))
+    {
+      *enough_memory = false;
+      goto done;
+    }
+
+  *enough_memory = true;
+
+  /* Stop walking as soon as some step reports running out of memory.  */
+  for (this_op_offset = op_list.avail - 1;
+       this_op_offset >= 0 && *enough_memory;
+       this_op_offset--)
+    {
+      char *p = bufp->buffer + op_list.offsets[this_op_offset];
+
+      switch ((enum regexpcode) *p)
+        {
+        case endline:
+        case endline_in_repeat:
+        case endline_before_newline:
+        case repeated_endline_before_newline:
+
+          /* If the '$' must be at the pattern's end or else is
+             in a trailing position.  */
+
+          if ((bufp->syntax & RE_ANCHORS_ONLY_AT_ENDS)
+              ? p + 1 == bend
+              : ((bufp->syntax & RE_TIGHT_ALT)
+                 ? p + 3 == bend  /* Would have two following no_ops.  */
+                 : (*p == endline_before_newline
+                    || *p == repeated_endline_before_newline
+                    || no_levels_match_anything (level_match_status))))
+            {
+              if ((enum regexpcode) *p == endline_in_repeat
+                  || (enum regexpcode) *p == repeated_endline_before_newline)
+                {
+                  if (bufp->syntax & RE_REPEATED_ANCHORS_AWAY)
+                    *p = no_op;
+                  else
+                    *p = endline;
+                }
+
+              /* If this is a trailing '$' in an empty alternative.  */
+
+              if ((bufp->syntax & RE_NO_EMPTY_ALTS)
+
+                  /* If there's an alternation op right before this `$'.  */
+                  && ((this_op_offset > 0
+                       && *(bufp->buffer
+                            + op_list.offsets[this_op_offset - 1])
+                          == jump_past_next_alt)
+
+                      /* Or this `$' is the only thing in the first
+                         alternative of more than one of them.  */
+
+                      || ((this_op_offset == 0   /* It's first.  */
+                           /* Or it's right after an open-group op.  */
+                           || (this_op_offset > 0
+                               && *(bufp->buffer
+                                    + op_list.offsets[this_op_offset - 1])
+                                  == start_memory))
+
+                          /* And it's right before an alternation op.  */
+                          && previous_p != NULL
+                          && *previous_p == jump_past_next_alt)))
+                {
+                  pattern_is_valid = false;
+                  goto done;
+                }
+            }
+
+          else if (bufp->syntax & RE_CONTEXT_INVALID_ANCHORS)
+            {
+              pattern_is_valid = false;
+              goto done;
+            }
+
+          else if (!(bufp->syntax & RE_CONTEXT_INDEP_ANCHORS))
+            {
+              /* Turn the anchor into a literal `$'.  */
+              p[0] = (char) exactn;
+              p[1] = (char) 1;
+              p[2] = '$';
+            }
+
+          break;
+
+
+          /* Yes, start and stop_memory are switched because we're going
+             backwards through the pattern!  */
+
+        case stop_memory:
+          increase_level (&current_level);
+
+          if (!make_group_active (&group_active_status, p[1]))
+            *enough_memory = false;
+
+          break;
+
+        case start_memory:
+          /* NOTE(review): the `else' below belongs to the inner `if'
+             (that is how the unbraced original parsed), so the level is
+             only decreased and the group only deactivated when this
+             level matched something.  Confirm that this is intended.  */
+          if (get_level_match_status (level_match_status, current_level))
+            {
+              if (!set_next_lower_level (&level_match_status, current_level))
+                *enough_memory = false;
+              else
+                {
+                  decrease_level (&current_level);
+                  make_group_inactive (&group_active_status, p[1]);
+                }
+            }
+
+          break;
+
+
+          /* Hit an alternative.  */
+
+        case jump_past_next_alt:
+          if (lower_levels_match_nothing (level_match_status, current_level))
+            clear_this_and_higher_levels (&level_match_status, current_level);
+
+          break;
+
+          /* These below mean was followed by a repetition operator.  */
+        case no_op:
+        case maybe_pop_jump:
+        case no_pop_jump_n:
+          if (bufp->syntax & RE_REPEATED_ANCHORS_AWAY)
+            break;
+          /* Otherwise fall through and treat it like an operator that
+             matches something.  */
+        case charset:
+        case charset_not:
+        case wordchar:
+        case notwordchar:
+        case exactn:
+        case anychar:
+          if (!set_this_level (&level_match_status, current_level)
+              || !set_match_status_of_active_groups (group_active_status,
+                                                     &group_match_status))
+            *enough_memory = false;
+
+          break;
+
+        case duplicate:
+          /* Only set level_match_status if this back reference
+             refers to a nonempty group.  */
+
+          if (get_group_match_status (group_forward_match_status, p[1]))
+            {
+              if (!set_this_level (&level_match_status, current_level))
+                *enough_memory = false;
+            }
+
+          break;
+
+        default:
+          printf ("Found an unknown operator %u in compiled pattern.\n",
+                  (unsigned) (unsigned char) *p);
+          break;
+        }
+
+      previous_p = p;
+    }
+
+ done:
+  free (level_match_status.bits);
+  free (group_match_status.bits);
+  free (group_active_status.bits);
+
+  return pattern_is_valid;
+}
+
+
+
+/* Bits list routines. (See above for macros.) */
+
+/* Give BITS_LIST_PTR its first bits block, with every bit clear.
+
+   The block is one `unsigned' obtained from malloc, and the list's size
+   becomes BITS_BLOCK_SIZE.
+
+   Return 1 if the block could be allocated.  Return 0 if it could not;
+   the list's `bits' member is then NULL and its size is not touched.  */
+
+static unsigned
+init_bits_list (bits_list_ptr)
+     bits_list_type *bits_list_ptr;
+{
+  unsigned *first_block = (unsigned *) malloc (sizeof (unsigned));
+
+  bits_list_ptr->bits = first_block;
+
+  if (!first_block)
+    return 0;
+
+  first_block[0] = 0;
+  bits_list_ptr->size = BITS_BLOCK_SIZE;
+
+  return 1;
+}
+
+
+/* Extend BITS_LIST_PTR by one bits block; the new block starts out with
+   every bit clear.
+
+   Return 1 if there's enough memory to do so and 0 if there isn't.  On
+   failure the list is left exactly as it was: the result of realloc is
+   only stored once it is known to be non-null, so the old block is
+   neither lost nor leaked.  */
+
+static unsigned
+extend_bits_list (bits_list_ptr)
+     bits_list_type *bits_list_ptr;
+{
+  /* NOTE(review): `size' is used as a bit count elsewhere (it grows by
+     BITS_BLOCK_SIZE below), yet here it is added to a byte count.  The
+     request looks larger than needed rather than too small, so it is
+     kept as is; confirm before tightening it.  */
+  unsigned *grown
+    = (unsigned *) realloc (bits_list_ptr->bits,
+                            bits_list_ptr->size + sizeof (unsigned));
+
+  if (grown == NULL)
+    return 0;
+
+  bits_list_ptr->bits = grown;
+  bits_list_ptr->size += BITS_BLOCK_SIZE;
+  bits_list_ptr->bits[(bits_list_ptr->size / BITS_BLOCK_SIZE) - 1] = 0;
+
+  return 1;
+}
+
+
+/* Get the bit value at POSITION in BITS_LIST.
+
+   The value returned is the containing block masked with
+   BITS_MASK (POSITION): zero if the bit is clear and nonzero (not
+   necessarily 1) if it is set.
+
+   POSITION must be less than the size of BITS_LIST; otherwise a message
+   is printed and the program exits with status 1.  POSITION is
+   unsigned, so there is no negative case to check for.  */
+
+static unsigned
+get_bit (bits_list, position)
+     bits_list_type bits_list;
+     unsigned position;
+{
+  /* Written as `>=' rather than `> size - 1' so that an empty list
+     (size 0) can't make the bound wrap around.  */
+  if (position >= bits_list.size)
+    {
+      printf ("Getting bit value: position %u exceeds bits list size %u.\n",
+              position, (unsigned) bits_list.size);
+      exit (1);
+    }
+
+  return bits_list.bits[BITS_BLOCK (position)] & BITS_MASK (position);
+}
+
+
+/* Set the bit for POSITION in BITS_LIST_PTR to VALUE, which, in turn,
+   can only be 0 or 1; any other VALUE prints a message and exits with
+   status 1.
+
+   If POSITION lies beyond the current end of the list, the list is
+   extended one block at a time until it covers POSITION.
+
+   Returns 1 if can set the bit and 0 if ran out of memory allocating
+   (if necessary) room for it.  POSITION is unsigned, so there is no
+   negative case to check for.  */
+
+static unsigned
+set_bit_to_value (bits_list_ptr, position, value)
+     bits_list_type *bits_list_ptr;
+     unsigned position;
+     unsigned value;
+{
+  /* One extension adds a single block, which need not be enough to
+     reach POSITION, so keep going until the list is long enough.  */
+  while (position >= bits_list_ptr->size)
+    if (!extend_bits_list (bits_list_ptr))
+      return 0;
+
+  if (value == 1)
+    bits_list_ptr->bits[BITS_BLOCK (position)] |= BITS_MASK (position);
+  else if (value == 0)
+    bits_list_ptr->bits[BITS_BLOCK (position)] &= ~(BITS_MASK (position));
+  else
+    {
+      printf ("Invalid value %u to set a bit.\n", value);
+      exit (1);
+    }
+
+  return 1;
+}
+
+
+/* Level stuff. */
+
+
+/* Report whether LEVEL in LEVEL_MATCH_STATUS matches something.
+
+   This is a thin wrapper around get_bit: the value returned is get_bit's
+   result for bit LEVEL, i.e., nonzero if the level is marked as matching
+   something and 0 if it isn't. Callers should test it for truth rather
+   than compare it with 1.
+
+   Since get_bit does so, this exits if LEVEL is beyond the end of
+   LEVEL_MATCH_STATUS. */
+
+static unsigned
+get_level_match_status (level_match_status, level)
+ bits_list_type level_match_status;
+ unsigned level;
+{
+ return get_bit (level_match_status, level);
+}
+
+
+/* Mark as matching something the level LEVEL in LEVEL_MATCH_STATUS_PTR.
+
+   This is a thin wrapper that sets bit LEVEL to 1 via set_bit_to_value,
+   which may have to extend the list to make room for the bit.
+
+   Return 1 if can mark the level and 0 if need to allocate space for it
+   but can't. */
+
+static unsigned
+set_this_level (level_match_status_ptr, level)
+ bits_list_type *level_match_status_ptr;
+ unsigned level;
+{
+ return set_bit_to_value (level_match_status_ptr, level, 1);
+}
+
+
+/* Mark as matching something the level just below LEVEL in
+   LEVEL_MATCH_STATUS_PTR, i.e., set bit LEVEL - 1.
+
+   LEVEL must be greater than zero.  LEVEL is unsigned, so LEVEL - 1
+   would wrap around to a huge bit position for level zero; as
+   decrease_level does for the same mistake, print a message and exit
+   with status 1 in that case.
+
+   Return 1 if can mark the level and 0 if ran out of memory trying to
+   do so.  */
+
+static unsigned
+set_next_lower_level (level_match_status_ptr, level)
+     bits_list_type *level_match_status_ptr;
+     unsigned level;
+{
+  if (level == 0)
+    {
+      printf ("Tried to set the level below level zero.\n");
+      exit (1);
+    }
+
+  return set_bit_to_value (level_match_status_ptr, level - 1, 1);
+}
+
+
+/* Mark as matching nothing the level LEVEL and all levels higher than
+   it currently in LEVEL_MATCH_STATUS_PTR, i.e., clear every bit from
+   LEVEL up to (but not including) the list's size.
+
+   Returns nothing. Only positions below the list's current size are
+   touched, so set_bit_to_value never has to extend the list here, and
+   its return value is ignored. */
+
+static void
+clear_this_and_higher_levels (level_match_status_ptr, level)
+ bits_list_type *level_match_status_ptr;
+ unsigned level;
+{
+ unsigned this_level;
+
+ for (this_level = level;
+ this_level < level_match_status_ptr->size;
+ this_level++)
+ set_bit_to_value (level_match_status_ptr, this_level, 0);
+}
+
+
+/* Tell whether every level of LEVEL_MATCH_STATUS below LEVEL is marked
+   as matching nothing.
+
+   Returns false at the first level in 0 .. LEVEL - 1 whose bit is set,
+   and true if there is no such level (in particular, when LEVEL is
+   zero).  */
+
+static boolean
+lower_levels_match_nothing (level_match_status, level)
+     bits_list_type level_match_status;
+     unsigned level;
+{
+  unsigned lower = 0;
+
+  while (lower < level)
+    {
+      if (get_bit (level_match_status, lower) != 0)
+        return false;
+
+      lower++;
+    }
+
+  return true;
+}
+
+/* Tell whether LEVEL_MATCH_STATUS has no level at all marked as
+   matching something.
+
+   The list is checked a whole bits block at a time: returns false as
+   soon as a block with any bit set is found, and true otherwise.  */
+
+static boolean
+no_levels_match_anything (level_match_status)
+     bits_list_type level_match_status;
+{
+  unsigned block = 0;
+
+  while (block < level_match_status.size / BITS_BLOCK_SIZE)
+    {
+      if (level_match_status.bits[block] != 0)
+        return false;
+
+      block++;
+    }
+
+  return true;
+}
+
+
+/* Move one nesting level up: add one to the level that
+   CURRENT_LEVEL_PTR points to.  */
+
+static void
+increase_level (current_level_ptr)
+     unsigned *current_level_ptr;
+{
+  *current_level_ptr += 1;
+}
+
+
+/* Move one nesting level down: subtract one from the level that
+   CURRENT_LEVEL_PTR points to.  Level zero is the lowest there is;
+   trying to go below it prints a message and exits with status 1.  */
+
+static void
+decrease_level (current_level_ptr)
+     unsigned *current_level_ptr;
+{
+  unsigned level = *current_level_ptr;
+
+  if (level == 0)
+    {
+      printf ("Tried to decrease current level below zero.\n");
+      exit (1);
+    }
+
+  *current_level_ptr = level - 1;
+}
+
+
+/* Group stuff. */
+
+
+/* Mark GROUP in GROUP_ACTIVE_STATUS_PTR as active.
+
+   This is a thin wrapper that sets bit GROUP to 1 via set_bit_to_value,
+   which may have to extend the list to make room for the bit.
+
+   Return 1 if can mark the group and 0 if ran out of memory trying to
+   do so. */
+
+static unsigned
+make_group_active (group_active_status_ptr, group)
+ bits_list_type *group_active_status_ptr;
+ unsigned group;
+{
+ return set_bit_to_value (group_active_status_ptr, group, 1);
+}
+
+
+/* Mark GROUP in GROUP_ACTIVE_STATUS_PTR as inactive.
+
+   This is a thin wrapper that sets bit GROUP to 0 via set_bit_to_value,
+   which may have to extend the list to make room for the bit.
+
+   Return 1 if can mark the group and 0 if ran out of memory trying to
+   do so. */
+
+static unsigned
+make_group_inactive (group_active_status_ptr, group)
+ bits_list_type *group_active_status_ptr;
+ unsigned group;
+{
+ return set_bit_to_value (group_active_status_ptr, group, 0);
+}
+
+
+/* Mark as matching something in GROUP_MATCH_STATUS_PTR those groups
+   recorded as active in GROUP_ACTIVE_STATUS, i.e., OR the active list
+   into the match list a whole bits block at a time.
+
+   The match list is first extended, one block at a time, until it is
+   at least as long as the active list, so that the copy below never
+   writes past its end.
+
+   Return 1 if can mark the groups and 0 if ran out of memory trying to
+   do so.  */
+
+static unsigned
+set_match_status_of_active_groups (group_active_status, group_match_status_ptr)
+     bits_list_type group_active_status;
+     bits_list_type *group_match_status_ptr;
+{
+  unsigned this_bit_block;
+
+  /* The two lists can differ by more than one block, so a single
+     extension is not necessarily enough.  */
+  while (group_active_status.size > group_match_status_ptr->size)
+    if (!extend_bits_list (group_match_status_ptr))
+      return 0;
+
+  for (this_bit_block = 0;
+       this_bit_block < group_active_status.size / BITS_BLOCK_SIZE;
+       this_bit_block++)
+    group_match_status_ptr->bits[this_bit_block]
+      |= group_active_status.bits[this_bit_block];
+
+  return 1;
+}
+
+
+/* Report whether GROUP in GROUP_MATCH_STATUS matches something.
+
+   This is a thin wrapper around get_bit: the value returned is get_bit's
+   result for bit GROUP, i.e., nonzero if the group is marked as matching
+   something and 0 if it isn't. Callers should test it for truth rather
+   than compare it with 1.
+
+   Since get_bit does so, this exits if GROUP is beyond the end of
+   GROUP_MATCH_STATUS. */
+
+static unsigned
+get_group_match_status (group_match_status, group)
+ bits_list_type group_match_status;
+ unsigned group;
+{
+ return get_bit (group_match_status, group);
+}
+
+
+
+
+/* Failure stack declarations and macros for both re_compile_fastmap and
+ re_match_2. Have to use `alloca' for reasons stated in INIT_BITS_LIST's
+ comment. */
+
+
+/* Roughly the maximum number of failure points on the stack. Would be
+ exactly that if always used MAX_FAILURE_SPACE each time we failed.
+ The stack-growing code below is handed this value (or a multiple of
+ it) as the limit beyond which it refuses to grow the stack. */
+
+int re_max_failures = 2000;
+
+
+/* One slot of a failure stack. */
+typedef unsigned char *failure_stack_element;
+
+/* A growable stack of failure_stack_element slots. */
+typedef struct {
+ failure_stack_element *stack;
+ /* Number of slots allocated in `stack'. */
+ unsigned size;
+ unsigned avail; /* Offset of next open position. */
+ } failure_stack_type;
+
+
+/* Tests on the state of a failure stack. Note that these take no
+ argument: the first and the last refer to a variable that must be
+ named `failure_stack' where they are used, and the middle one to a
+ pointer that must be named `failure_stack_ptr'. */
+#define FAILURE_STACK_EMPTY (failure_stack.avail == 0)
+#define FAILURE_STACK_PTR_EMPTY (failure_stack_ptr->avail == 0)
+#define FAILURE_STACK_FULL (failure_stack.avail == failure_stack.size)
+
+
+/* Initialize a failure stack.
+
+ Allocates room for INIT_FAILURE_ALLOC elements with REGEX_ALLOCATE
+ and, if that worked, records that size and marks the stack as empty.
+
+ Return 1 if was able to allocate the space for (FAILURE_STACK) and
+ 0 if not. The macro is an expression; when it yields 0 the stack's
+ `stack' member is NULL and nothing may be pushed, so users need to
+ check the value. */
+
+#define INIT_FAILURE_STACK(failure_stack) \
+ ((failure_stack).stack = (failure_stack_element *) \
+ REGEX_ALLOCATE (INIT_FAILURE_ALLOC * sizeof (failure_stack_element)),\
+ \
+ (failure_stack).stack == NULL \
+ ? 0 \
+ : ((failure_stack).size = INIT_FAILURE_ALLOC, \
+ (failure_stack).avail = 0, \
+ 1))
+
+
+/* Double the size of FAILURE_STACK, unless it has already grown past
+   MAX_SIZE.
+
+   Growth is only refused once the current size exceeds MAX_SIZE, so
+   the stack can end up as large as twice MAX_SIZE.
+
+   Return 1 if was able to double it, and 0 if either ran out of memory
+   allocating space for it or it was already larger than MAX_SIZE.
+
+   MAX_SIZE is parenthesized in the expansion so that any expression
+   can safely be passed for it.
+
+   REGEX_REALLOCATE requires `void *destination' be declared.  */
+
+#define DOUBLE_FAILURE_STACK(failure_stack, max_size)			\
+  ((failure_stack).size > (max_size)					\
+   ? 0									\
+   : ((failure_stack).stack = (failure_stack_element *)		\
+        REGEX_REALLOCATE ((failure_stack).stack,			\
+          ((failure_stack).size << 1) * sizeof (failure_stack_element)),\
+									\
+      (failure_stack).stack == NULL					\
+      ? 0								\
+      : ((failure_stack).size <<= 1,					\
+         1)))
+
+
+/* Push PATTERN_OP on (FAILURE_STACK), doubling the stack first (with
+   re_max_failures as the limit) if it is full.
+
+   Return 1 if was able to do so and 0 if ran out of memory allocating
+   space to do so, or if the stack may not grow any further.
+
+   The fullness test is spelled out on the macro's own FAILURE_STACK
+   argument, so the macro works whatever the stack variable is called.
+
+   DOUBLE_FAILURE_STACK requires `void *destination' be declared.  */
+
+#define PUSH_PATTERN_OP(pattern_op, failure_stack)			\
+  (((failure_stack).avail == (failure_stack).size			\
+    && !DOUBLE_FAILURE_STACK (failure_stack, re_max_failures))		\
+   ? 0									\
+   : ((failure_stack).stack[(failure_stack).avail++] = (pattern_op),	\
+      1))
+
+
+/* Push most of the information about the state we will want
+ if we ever fail back to it.
+
+ What gets pushed, in this order:
+ - for each register from 1 up to the highest one whose regstart is
+ not (unsigned char *) -1: its regstart, its regend, and the
+ address of its reg_info entry (the address, not a copy);
+ - the number of that highest register, cast to a pointer;
+ - PATTERN_PLACE;
+ - STRING_PLACE.
+
+ Before pushing, the stack is doubled for as long as
+ REMAINING_AVAIL_SLOTS is less than NUM_FAILURE_ITEMS.
+
+ Requires regstart, regend, reg_info, and num_internal_regs be declared.
+ The `void *destination' that DOUBLE_FAILURE_STACK requires is declared
+ by the macro itself, inside its own block.
+
+ Does a `return FAILURE_CODE' if runs out of memory, so this can only
+ be used inside a function that can return FAILURE_CODE. */
+
+#define PUSH_FAILURE_POINT(pattern_place, string_place, failure_stack, failure_code) \
+ do { \
+ long highest_used_reg, this_reg; \
+ void *destination; \
+ \
+ /* Find out how many registers are active or have been matched. \
+ (Aside from register zero, which is only set at the end.) */ \
+ \
+ for (highest_used_reg = num_internal_regs - 1; highest_used_reg > 0;\
+ highest_used_reg--) \
+ if (regstart[highest_used_reg] != (unsigned char *) -1) \
+ break; \
+ \
+ while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
+ if (!DOUBLE_FAILURE_STACK (failure_stack, \
+ re_max_failures * MAX_FAILURE_ITEMS)) \
+ return failure_code; \
+ \
+ /* Now push the info for each of those registers. */ \
+ \
+ for (this_reg = 1; this_reg <= highest_used_reg; this_reg++) \
+ { \
+ (failure_stack).stack[(failure_stack).avail++] \
+ = regstart[this_reg]; \
+ \
+ (failure_stack).stack[(failure_stack).avail++] = regend[this_reg];\
+ \
+ (failure_stack).stack[(failure_stack).avail++] \
+ = (unsigned char *) &reg_info[this_reg]; \
+ } \
+ \
+ /* Push how many registers we saved. */ \
+ (failure_stack).stack[(failure_stack).avail++] \
+ = (unsigned char *) highest_used_reg; \
+ \
+ (failure_stack).stack[(failure_stack).avail++] = pattern_place; \
+ (failure_stack).stack[(failure_stack).avail++] = string_place; \
+ } while (0)
+
+
+
+
/* Given a pattern, compute a fastmap from it. The fastmap records
which of the (1 << BYTEWIDTH) possible characters can start a string
that matches the pattern. This fastmap is used by re_search to skip
- quickly over totally implausible text.
+ quickly over totally impossible text.
The caller must supply the address of a (1 << BYTEWIDTH)-byte data
area as bufp->fastmap.
- The other components of bufp describe the pattern to be used. */
+ The other components of bufp describe the pattern to be used.
+
+ Returns 0 if it can compile a fastmap.
+ Returns -2 if there is an internal error. */
-void
+int
re_compile_fastmap (bufp)
struct re_pattern_buffer *bufp;
{
unsigned char *pattern = (unsigned char *) bufp->buffer;
int size = bufp->used;
register char *fastmap = bufp->fastmap;
- register unsigned char *p = pattern;
+ unsigned char *p = pattern;
register unsigned char *pend = pattern + size;
- register int j, k;
+ int j, k;
unsigned char *translate = (unsigned char *) bufp->translate;
- unsigned is_a_succeed_n;
+ failure_stack_type failure_stack;
+ void *destination;
-#ifndef NO_ALLOCA
- unsigned char *stackb[NFAILURES];
- unsigned char **stackp = stackb;
-#else
- unsigned char **stackb;
- unsigned char **stackp;
- stackb = (unsigned char **) malloc (NFAILURES * sizeof (unsigned char *));
- stackp = stackb;
+ INIT_FAILURE_STACK (failure_stack);
-#endif /* NO_ALLOCA */
- memset (fastmap, 0, (1 << BYTEWIDTH));
+ bzero (fastmap, (1 << BYTEWIDTH));
bufp->fastmap_accurate = 1;
bufp->can_be_null = 0;
while (p)
{
- is_a_succeed_n = 0;
+ boolean is_a_succeed_n = false;
+
if (p == pend)
- {
- bufp->can_be_null = 1;
- break;
- }
+ if (FAILURE_STACK_EMPTY)
+ {
+ bufp->can_be_null = 1;
+ break;
+ }
+ else
+ p = failure_stack.stack[--failure_stack.avail];
+
+
#ifdef SWITCH_ENUM_BUG
switch ((int) ((enum regexpcode) *p++))
#else
@@ -1396,10 +3044,7 @@ re_compile_fastmap (bufp)
#endif
{
case exactn:
- if (translate)
- fastmap[translate[p[1]]] = 1;
- else
- fastmap[p[1]] = 1;
+ fastmap[translate ? translate[p[1]] : p[1]] = 1;
break;
case begline:
@@ -1415,55 +3060,63 @@ re_compile_fastmap (bufp)
continue;
case endline:
- if (translate)
- fastmap[translate['\n']] = 1;
- else
- fastmap['\n'] = 1;
+ fastmap[translate ? translate['\n'] : '\n'] = 1;
- if (bufp->can_be_null != 1)
+ if (! bufp->can_be_null)
bufp->can_be_null = 2;
break;
- case jump_n:
- case finalize_jump:
- case maybe_finalize_jump:
- case jump:
+ case no_pop_jump_n:
+ case pop_failure_jump:
+ case maybe_pop_jump:
+ case no_pop_jump:
+ case jump_past_next_alt:
case dummy_failure_jump:
- EXTRACT_NUMBER_AND_INCR (j, p);
+ extract_number_and_incr (&j, &p);
p += j;
if (j > 0)
continue;
+
/* Jump backward reached implies we just went through
- the body of a loop and matched nothing.
- Opcode jumped to should be an on_failure_jump.
- Just treat it like an ordinary jump.
- For a * loop, it has pushed its failure point already;
- If so, discard that as redundant. */
+ the body of a loop and matched nothing. Opcode jumped to
+ should be an on_failure_jump or succeed_n. Just treat it
+ like an ordinary jump. For a * loop, it has pushed its
+ failure point already; If so, discard that as redundant. */
if ((enum regexpcode) *p != on_failure_jump
&& (enum regexpcode) *p != succeed_n)
continue;
+
p++;
- EXTRACT_NUMBER_AND_INCR (j, p);
+ extract_number_and_incr (&j, &p);
p += j;
- if (stackp != stackb && *stackp == p)
- stackp--;
+
+ /* If what's on the stack is where we are now, pop it. */
+
+ if (!FAILURE_STACK_EMPTY
+ && failure_stack.stack[failure_stack.avail - 1] == p)
+ failure_stack.avail--;
+
continue;
case on_failure_jump:
handle_on_failure_jump:
- EXTRACT_NUMBER_AND_INCR (j, p);
- *++stackp = p + j;
- if (is_a_succeed_n)
- EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
- continue;
+ extract_number_and_incr (&j, &p);
+
+ if (!PUSH_PATTERN_OP (p + j, failure_stack))
+ return -2;
+
+ if (is_a_succeed_n)
+ extract_number_and_incr (&k, &p); /* Skip the n. */
+
+ continue;
case succeed_n:
- is_a_succeed_n = 1;
+ is_a_succeed_n = true;
/* Get to the number of times to succeed. */
p += 2;
/* Increment p past the n for when k != 0. */
- EXTRACT_NUMBER_AND_INCR (k, p);
+ extract_number_and_incr (&k, &p);
if (k == 0)
{
p -= 4;
@@ -1488,9 +3141,7 @@ re_compile_fastmap (bufp)
if (j != '\n')
fastmap[j] = 1;
if (bufp->can_be_null)
- {
- FREE_AND_RETURN_VOID(stackb);
- }
+ return 0;
/* Don't return; check the alternative paths
so we can set can_be_null if appropriate. */
break;
@@ -1523,47 +3174,37 @@ re_compile_fastmap (bufp)
break;
#endif /* not emacs */
- case charset:
- for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
+ case charset:
+ for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
- {
- if (translate)
- fastmap[translate[j]] = 1;
- else
- fastmap[j] = 1;
- }
+ fastmap[translate ? translate[j] : j] = 1;
break;
case charset_not:
- /* Chars beyond end of map must be allowed */
+ /* Chars beyond end of map must be allowed. */
for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
- if (translate)
- fastmap[translate[j]] = 1;
- else
- fastmap[j] = 1;
+ fastmap[translate ? translate[j] : j] = 1;
for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
- {
- if (translate)
- fastmap[translate[j]] = 1;
- else
- fastmap[j] = 1;
- }
- break;
- }
+ fastmap[translate ? translate[j] : j] = 1;
- /* Get here means we have successfully found the possible starting
+ break;
+ } /* End switch *p++. */
+
+ /* Getting here means we have successfully found the possible starting
characters of one path of the pattern. We need not follow this
path any farther. Instead, look at the next alternative
remembered in the stack. */
- if (stackp != stackb)
- p = *stackp--;
+
+ if (!FAILURE_STACK_EMPTY)
+ p = failure_stack.stack[--failure_stack.avail];
else
break;
}
- FREE_AND_RETURN_VOID(stackb);
-}
+ return 0;
+} /* re_compile_fastmap */
+
@@ -1571,110 +3212,123 @@ re_compile_fastmap (bufp)
doesn't let you say where to stop matching. */
int
-re_search (pbufp, string, size, startpos, range, regs)
- struct re_pattern_buffer *pbufp;
- char *string;
- int size, startpos, range;
+re_search (bufp, string, size, startpos, range, regs)
+ struct re_pattern_buffer *bufp;
+ const char *string;
+ const int size, startpos, range;
struct re_registers *regs;
{
- return re_search_2 (pbufp, (char *) 0, 0, string, size, startpos, range,
+ return re_search_2 (bufp, (char *) 0, 0, string, size, startpos, range,
regs, size);
}
-/* Using the compiled pattern in PBUFP->buffer, first tries to match the
+/* Using the compiled pattern in BUFP->buffer, first tries to match the
virtual concatenation of STRING1 and STRING2, starting first at index
STARTPOS, then at STARTPOS + 1, and so on. RANGE is the number of
places to try before giving up. If RANGE is negative, it searches
- backwards, i.e., the starting positions tried are STARTPOS, STARTPOS
- - 1, etc. STRING1 and STRING2 are of SIZE1 and SIZE2, respectively.
+ backwards, i.e., the starting positions tried are STARTPOS, STARTPOS - 1,
+ etc. STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
In REGS, return the indices of the virtual concatenation of STRING1
- and STRING2 that matched the entire PBUFP->buffer and its contained
+ and STRING2 that matched the entire BUFP->buffer and its contained
subexpressions. Do not consider matching one past the index MSTOP in
the virtual concatenation of STRING1 and STRING2.
The value returned is the position in the strings at which the match
- was found, or -1 if no match was found, or -2 if error (such as
+ was found, -1 if no match was found, or -2 if error (such as
failure stack overflow). */
int
-re_search_2 (pbufp, string1, size1, string2, size2, startpos, range,
- regs, mstop)
- struct re_pattern_buffer *pbufp;
- char *string1, *string2;
- int size1, size2;
- int startpos;
- register int range;
+re_search_2 (bufp, string1, size1, string2, size2, startpos, range,
+ regs, stop)
+ struct re_pattern_buffer *bufp;
+ const char *string1, *string2;
+ const int size1, size2;
+ const int startpos;
+ const int range;
struct re_registers *regs;
- int mstop;
+ const int stop;
{
- register char *fastmap = pbufp->fastmap;
- register unsigned char *translate = (unsigned char *) pbufp->translate;
+ register char *fastmap = bufp->fastmap;
+ register unsigned char *translate = (unsigned char *) bufp->translate;
int total_size = size1 + size2;
- int endpos = startpos + range;
+ int private_startpos = startpos;
+ int private_endpos = startpos + range;
+ int private_range = range;
int val;
+ const struct re_pattern_buffer *private_bufp;
/* Check for out-of-range starting position. */
- if (startpos < 0 || startpos > total_size)
+ if (private_startpos < 0 || private_startpos > total_size)
return -1;
- /* Fix up range if it would eventually take startpos outside of the
- virtual concatenation of string1 and string2. */
- if (endpos < -1)
- range = -1 - startpos;
- else if (endpos > total_size)
- range = total_size - startpos;
-
- /* Update the fastmap now if not correct already. */
- if (fastmap && !pbufp->fastmap_accurate)
- re_compile_fastmap (pbufp);
+ /* Fix up range if it would eventually take private_startpos outside
+ of the virtual concatenation of string1 and string2. */
+
+ if (private_endpos < -1)
+ private_range = -1 - private_startpos;
+
+ else if (private_endpos > total_size)
+ private_range = total_size - private_startpos;
+
+
+/* Update the fastmap now if not correct already. */
+ if (fastmap && !bufp->fastmap_accurate)
+ if (re_compile_fastmap (bufp) == -2)
+ return -2;
/* If the search isn't to be a backwards one, don't waste time in a
long search for a pattern that says it is anchored. */
- if (pbufp->used > 0 && (enum regexpcode) pbufp->buffer[0] == begbuf
- && range > 0)
+ if (bufp->used > 0 && (enum regexpcode) bufp->buffer[0] == begbuf
+ && private_range > 0)
{
- if (startpos > 0)
+ if (private_startpos > 0)
return -1;
else
- range = 1;
+ private_range = 1;
}
+ private_bufp = bufp;
+
while (1)
{
/* If a fastmap is supplied, skip quickly over characters that
cannot possibly be the start of a match. Note, however, that
- if the pattern can possibly match the null string, we must
- test it at each starting point so that we take the first null
- string we get. */
+ if the pattern can possibly match the null string, we don't
+ want to skip over characters; we want the first null string we
+ can match. */
- if (fastmap && startpos < total_size && pbufp->can_be_null != 1)
+ if (fastmap && private_startpos < total_size && !bufp->can_be_null)
{
- if (range > 0) /* Searching forwards. */
+ if (private_range > 0) /* Searching forwards. */
{
register int lim = 0;
register unsigned char *p;
- int irange = range;
- if (startpos < size1 && startpos + range >= size1)
- lim = range - (size1 - startpos);
+ int irange = private_range;
+
+ if (private_startpos < size1
+ && private_startpos + private_range >= size1)
+ lim = private_range - (size1 - private_startpos);
p = ((unsigned char *)
- &(startpos >= size1 ? string2 - size1 : string1)[startpos]);
+ &(private_startpos >= size1
+ ? string2 - size1
+ : string1)[private_startpos]);
- while (range > lim && !fastmap[translate
+ while (private_range > lim && !fastmap[translate
? translate[*p++]
: *p++])
- range--;
- startpos += irange - range;
+ private_range--;
+ private_startpos += irange - private_range;
}
else /* Searching backwards. */
{
register unsigned char c;
- if (string1 == 0 || startpos >= size1)
- c = string2[startpos - size1];
+ if (size1 == 0 || private_startpos >= size1)
+ c = string2[private_startpos - size1];
else
- c = string1[startpos];
+ c = string1[private_startpos];
c &= 0xff;
if (translate ? !fastmap[translate[c]] : !fastmap[c])
@@ -1682,35 +3336,30 @@ re_search_2 (pbufp, string1, size1, string2, size2, startpos, range,
}
}
- if (range >= 0 && startpos == total_size
- && fastmap && pbufp->can_be_null == 0)
+ if (private_range >= 0 && private_startpos == total_size
+ && fastmap && bufp->can_be_null == 0)
return -1;
- val = re_match_2 (pbufp, string1, size1, string2, size2, startpos,
- regs, mstop);
+ val = re_match_2 (private_bufp, string1, size1, string2, size2,
+ private_startpos, regs, stop);
if (val >= 0)
- return startpos;
+ return private_startpos;
+
if (val == -2)
return -2;
-#ifndef NO_ALLOCA
-#ifdef C_ALLOCA
- alloca (0);
-#endif /* C_ALLOCA */
-
-#endif /* NO_ALLOCA */
advance:
- if (!range)
+ if (!private_range)
break;
- else if (range > 0)
+ else if (private_range > 0)
{
- range--;
- startpos++;
+ private_range--;
+ private_startpos++;
}
else
{
- range++;
- startpos--;
+ private_range++;
+ private_startpos--;
}
}
return -1;
@@ -1720,115 +3369,118 @@ re_search_2 (pbufp, string1, size1, string2, size2, startpos, range,
#ifndef emacs /* emacs never uses this. */
int
-re_match (pbufp, string, size, pos, regs)
- struct re_pattern_buffer *pbufp;
- char *string;
- int size, pos;
+re_match (bufp, string, size, pos, regs)
+ const struct re_pattern_buffer *bufp;
+ const char *string;
+ const int size, pos;
struct re_registers *regs;
{
- return re_match_2 (pbufp, (char *) 0, 0, string, size, pos, regs, size);
+ return re_match_2 (bufp, (char *) 0, 0, string, size, pos, regs, size);
}
#endif /* not emacs */
-/* The following are used for re_match_2, defined below: */
+
+/* Routines for re_match_2, defined below. */
-/* Roughly the maximum number of failure points on the stack. Would be
- exactly that if always pushed MAX_NUM_FAILURE_ITEMS each time we failed. */
-
-int re_max_failures = 2000;
+static boolean group_can_match_nothing ();
+static int bcmp_translate ();
-/* Routine used by re_match_2. */
-static int memcmp_translate ();
+/* Macros used by re_match_2, defined below: */
/* Structure and accessing macros used in re_match_2: */
-struct register_info
+typedef struct register_info
{
+ bits_list_type inner_groups; /* Which groups are inside this one. */
+ int can_match_nothing; /* Set if this group can match nothing;
+ -1 if not ever set. */
unsigned is_active : 1;
unsigned matched_something : 1;
-};
+ unsigned ever_matched_something : 1;
+} reg_info_type;
+
+/* Macros used by re_match_2: */
+/* I.e., regstart, regend, and reg_info. */
+
+#define INNER_GROUPS(R) ((R).inner_groups)
+#define CAN_MATCH_NOTHING(R) ((R).can_match_nothing)
#define IS_ACTIVE(R) ((R).is_active)
#define MATCHED_SOMETHING(R) ((R).matched_something)
+#define EVER_MATCHED_SOMETHING(R) ((R).ever_matched_something)
-/* Macros used by re_match_2: */
+/* Record that group INNER is inside of all currently active groups. */
+#define NOTE_INNER_GROUP(inner) \
+ do { unsigned this_reg; \
+ for (this_reg = 0; this_reg < num_internal_regs; this_reg++) \
+ { \
+ void *destination; /* For SET_BIT_TO_VALUE. */ \
+ int ret = SET_BIT_TO_VALUE (INNER_GROUPS (reg_info[this_reg]), \
+ inner, \
+ IS_ACTIVE(reg_info[this_reg])); \
+ if (ret == 0) \
+ { \
+ printf ("Ran out of memory in re_match_2 (NOTE_INNER_GROUP).\n");\
+ exit (1); \
+ } \
+ if (ret != 1) \
+ { \
+ printf ("Invalid value %d to set a bit.\n", ret); \
+ exit (1); \
+ } \
+ } \
+ } while (0)
-/* I.e., regstart, regend, and reg_info. */
-#define NUM_REG_ITEMS 3
+/* Call this when we have matched something; it sets `matched' flags for
+ the registers corresponding to the groups we are currently inside.
+ Also records whether each such group ever matched something. */
-/* We push at most this many things on the stack whenever we
- fail. The `+ 2' refers to PATTERN_PLACE and STRING_PLACE, which are
- arguments to the PUSH_FAILURE_POINT macro. */
+#define SET_REGS_MATCHED \
+ do { unsigned this_reg; \
+ for (this_reg = 0; this_reg < num_internal_regs; this_reg++) \
+ { \
+ MATCHED_SOMETHING (reg_info[this_reg]) = \
+ EVER_MATCHED_SOMETHING (reg_info[this_reg]) = \
+ (IS_ACTIVE (reg_info[this_reg])) ? 1 : 0; \
+ } \
+ } while (0)
-#define MAX_NUM_FAILURE_ITEMS (RE_NREGS * NUM_REG_ITEMS + 2)
-/* We push this many things on the stack whenever we fail. */
+/* Failure stack macros for re_match_2. */
-#define NUM_FAILURE_ITEMS (last_used_reg * NUM_REG_ITEMS + 2)
+/* This is the number of items that are pushed and popped on the stack
+ for each register, i.e., its REGSTART, REGEND and REG_INFO. */
+#define NUM_REG_ITEMS 3
-/* This pushes most of the information about the current state we will want
- if we ever fail back to it. */
+/* Refers to highest_used_reg (which we calculate), PATTERN_PLACE and
+ STRING_PLACE, which are arguments to the PUSH_FAILURE_POINT macro. */
+
+#define NUM_OTHER_ITEMS 3
-#define PUSH_FAILURE_POINT(pattern_place, string_place) \
- { \
- short last_used_reg, this_reg; \
- \
- /* Find out how many registers are active or have been matched. \
- (Aside from register zero, which is only set at the end.) */ \
- for (last_used_reg = RE_NREGS - 1; last_used_reg > 0; last_used_reg--)\
- if (regstart[last_used_reg] != (unsigned char *) -1) \
- break; \
- \
- if (stacke - stackp < NUM_FAILURE_ITEMS) \
- { \
- unsigned char **stackx; \
- unsigned int len = stacke - stackb; \
- if (len > re_max_failures * MAX_NUM_FAILURE_ITEMS) \
- { \
- FREE_AND_RETURN(stackb,(-2)); \
- } \
- \
- /* Roughly double the size of the stack. */ \
- stackx = DOUBLE_STACK(stackx,stackb,len); \
- /* Rearrange the pointers. */ \
- stackp = stackx + (stackp - stackb); \
- stackb = stackx; \
- stacke = stackb + 2 * len; \
- } \
- \
- /* Now push the info for each of those registers. */ \
- for (this_reg = 1; this_reg <= last_used_reg; this_reg++) \
- { \
- *stackp++ = regstart[this_reg]; \
- *stackp++ = regend[this_reg]; \
- *stackp++ = (unsigned char *) &reg_info[this_reg]; \
- } \
- \
- /* Push how many registers we saved. */ \
- *stackp++ = (unsigned char *) last_used_reg; \
- \
- *stackp++ = pattern_place; \
- *stackp++ = string_place; \
- }
-
+/* We put at most this many items on the stack whenever we push a
+ failure point. */
-/* This pops what PUSH_FAILURE_POINT pushes. */
+#define MAX_FAILURE_ITEMS \
+ (num_internal_regs * NUM_REG_ITEMS + NUM_OTHER_ITEMS)
+
+
+/* We really push this many items when pushing a failure point. We
+ calculate highest_used_reg each time. */
+
+#define NUM_FAILURE_ITEMS \
+ (highest_used_reg * NUM_REG_ITEMS + NUM_OTHER_ITEMS)
+
+/* How many items can still be added to the stack without overflowing it. */
+#define REMAINING_AVAIL_SLOTS \
+ (failure_stack.size - failure_stack.avail)
-#define POP_FAILURE_POINT() \
- { \
- int temp; \
- stackp -= 2; /* Remove failure points. */ \
- temp = (int) *--stackp; /* How many regs pushed. */ \
- temp *= NUM_REG_ITEMS; /* How much to take off the stack. */ \
- stackp -= temp; /* Remove the register info. */ \
- }
#define MATCHING_IN_FIRST_STRING (dend == end_match_1)
@@ -1854,19 +3506,6 @@ struct register_info
}
-/* Call this when have matched something; it sets `matched' flags for the
- registers corresponding to the subexpressions of which we currently
- are inside. */
-#define SET_REGS_MATCHED \
- { unsigned this_reg; \
- for (this_reg = 0; this_reg < RE_NREGS; this_reg++) \
- { \
- if (IS_ACTIVE(reg_info[this_reg])) \
- MATCHED_SOMETHING(reg_info[this_reg]) = 1; \
- else \
- MATCHED_SOMETHING(reg_info[this_reg]) = 0; \
- } \
- }
/* Test if at very beginning or at very end of the virtual concatenation
of string1 and string2. If there is only one string, we've put it in
@@ -1884,58 +3523,151 @@ struct register_info
2) if we're before the beginning of string2, we have to look at the
last character in string1; we assume there is a string1, so use
this in conjunction with AT_STRINGS_BEG. */
+
#define IS_A_LETTER(d) \
(SYNTAX ((d) == end1 ? *string2 : (d) == string2 - 1 ? *(end1 - 1) : *(d))\
== Sword)
+#ifdef REGEX_MALLOC
+#define FREE_VARIABLES \
+ do { \
+ free (failure_stack.stack); \
+ free (regstart); \
+ free (regend); \
+ free (old_regstart); \
+ free (old_regend); \
+ free (reg_info); \
+ free (best_regstart); \
+ free (best_regend); \
+ reg_info = NULL; \
+ failure_stack.stack = NULL; \
+ regstart = regend = old_regstart = old_regend \
+ = best_regstart = best_regend = NULL; \
+ } while (0)
+#endif
+
+
+
+/* The main matching routine, re_match_2. */
+
+static void pop_failure_point();
+
-/* Match the pattern described by PBUFP against the virtual
- concatenation of STRING1 and STRING2, which are of SIZE1 and SIZE2,
- respectively. Start the match at index POS in the virtual
- concatenation of STRING1 and STRING2. In REGS, return the indices of
- the virtual concatenation of STRING1 and STRING2 that matched the
- entire PBUFP->buffer and its contained subexpressions. Do not
- consider matching one past the index MSTOP in the virtual
- concatenation of STRING1 and STRING2.
+/* re_match_2 takes a buffer full of byte commands for matching (gotten
+ from compiling a regular expression) and matches it against the
+ virtual concatenation of its two string arguments.
+
+ BUFP is a struct re_pattern_buffer * whose pertinent fields are
+ mentioned below:
+
+ It has a char * field BUFFER which points to the byte
+ commands which make up the compiled pattern.
+
+ Its char * field TRANSLATE, if not 0, translates all
+ ordinary elements in the compiled pattern.
- If pbufp->fastmap is nonzero, then it had better be up to date.
+ Its int field SYNTAX is the syntax with which the pattern
+ was compiled and hence should be matched with.
+
+ The long field USED is how many bytes long the compiled
+ pattern is.
+
+ Its size_t field RE_NSUB contains how many subexpressions
+ the pattern has.
+
+ It ignores its NO_SUB bit.
+
+ If its RETURN_DEFAULT_NUM_REGS bit is set, then if REGS is
+ nonzero, re_match_2 reports in REGS->start[i] and
+ REGS->end[i], for i = 1 to BUFP->RE_NSUB + 1, which
+ substring of the virtual concatenation of STRING1 and
+ STRING2 matched the i-th subexpression of the regular
+ expression compiled in BUFFER; it records in REGS->start[0]
+ and REGS->end[0] information about all of that
+ concatenation. If RETURN_DEFAULT_NUM_REGS isn't set,
+ re_match_2 returns in REGS similar information about i
+ things for i = 1 to REGS->num_regs. If REGS is zero,
+ re_match_2 ignores it. See the comment for `struct
+ re_registers' for more details.
+
+ STRING1 and STRING2
+ are the addresses of the strings of which re_match_2 tries
+ to match the virtual concatenation. Because of this
+ concatenation, this function can be used on an Emacs
+ buffer's contents.
+
+ SIZE1 is the size of STRING1.
- The reason that the data to match are specified as two components
- which are to be regarded as concatenated is so this function can be
- used directly on the contents of an Emacs buffer.
+ SIZE2 is the size of STRING2.
+
+ POS is the index in the virtual concatenation of STRING1 and
+ STRING2 at which re_match_2 tries to start the match.
+
+ REGS is a struct re_registers *. If it's not zero, then
+ re_match_2 will fill its fields START and END with
+ information about what substrings of the virtual
+ concatenation of STRING1 and STRING2 were matched by the
+ groups represented in BUFP's BUFFER field. You must have
+ allocated the correct amount of space in the `start' and
+ `end' fields of REGS to accommodate `num_regs' (the other
+ field) registers. See the comment for `struct re_registers'
+ in regex.h for more details.
+
+ STOP is the index in the virtual concatenation of STRING1 and
+ STRING2 beyond which re_match_2 won't consider matching.
- -1 is returned if there is no match. -2 is returned if there is an
- error (such as match stack overflow). Otherwise the value is the
- length of the substring which was matched. */
+ It returns -1 if there is no match, -2 if there is an internal error
+ (such as its stack overflowing). Otherwise, it returns the length of
+ the substring it matched. */
int
-re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
- struct re_pattern_buffer *pbufp;
- char *string1_arg, *string2_arg;
- int size1, size2;
- int pos;
+re_match_2 (bufp, string1_arg, size1_arg, string2_arg, size2_arg, pos,
+ regs, stop)
+ const struct re_pattern_buffer *bufp;
+ const char *string1_arg;
+ const int size1_arg;
+ const char *string2_arg;
+ const int size2_arg;
+ const int pos;
struct re_registers *regs;
- int mstop;
+ const int stop;
{
- register unsigned char *p = (unsigned char *) pbufp->buffer;
+ unsigned char *p = (unsigned char *) bufp->buffer;
+ unsigned char *p1;
/* Pointer to beyond end of buffer. */
- register unsigned char *pend = p + pbufp->used;
+ register unsigned char *pend = p + bufp->used;
unsigned char *string1 = (unsigned char *) string1_arg;
unsigned char *string2 = (unsigned char *) string2_arg;
+ int size1 = size1_arg;
+ int size2 = size2_arg;
unsigned char *end1; /* Just past end of first string. */
unsigned char *end2; /* Just past end of second string. */
+
/* Pointers into string1 and string2, just past the last characters in
each to consider matching. */
unsigned char *end_match_1, *end_match_2;
register unsigned char *d, *dend;
- register int mcnt; /* Multipurpose. */
- unsigned char *translate = (unsigned char *) pbufp->translate;
+ int mcnt, mcnt2; /* Multipurpose. */
+ unsigned char *translate = (unsigned char *) bufp->translate;
unsigned is_a_jump_n = 0;
+ /* This is how many registers the caller wants. */
+ unsigned num_regs_wanted = regs
+ ? bufp->return_default_num_regs
+ ? bufp->re_nsub + 1
+ : regs->num_regs
+ : 0;
+
+ /* Want to fill *all* the registers internally. */
+ unsigned num_internal_regs = bufp->re_nsub + 1;
+
+ void *destination; /* For REGEX_REALLOCATE. */
+
+
/* Failure point stack. Each place that can handle a failure further
down the line pushes a failure point on this stack. It consists of
restart, regend, and reg_info for all registers corresponding to the
@@ -1946,13 +3678,7 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
``dummy''; if a failure happens and the failure point is a dummy, it
gets discarded and the next next one is tried. */
-#ifndef NO_ALLOCA
- unsigned char *initial_stack[MAX_NUM_FAILURE_ITEMS * NFAILURES];
-#endif
- unsigned char **stackb;
- unsigned char **stackp;
- unsigned char **stacke;
-
+ failure_stack_type failure_stack;
/* Information on the contents of registers. These are pointers into
the input strings; they record just what was matched (on this
@@ -1962,8 +3688,21 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
stopped matching the regnum-th subexpression. (The zeroth register
keeps track of what the whole pattern matches.) */
- unsigned char *regstart[RE_NREGS];
- unsigned char *regend[RE_NREGS];
+ unsigned char **regstart = (unsigned char **)
+ REGEX_ALLOCATE (num_internal_regs * sizeof (unsigned char *));
+ unsigned char **regend = (unsigned char **)
+ REGEX_ALLOCATE (num_internal_regs * sizeof (unsigned char *));
+
+ /* If a group that's operated upon by a repetition operator fails to
+ match anything, then the register for its start will need to be
+ restored because it will have been set to wherever in the string we
+ are when we last see its open-group operator. The argument is
+ similar for a register's end. */
+
+ unsigned char **old_regstart
+ = (unsigned char **) REGEX_ALLOCATE (num_internal_regs * sizeof (unsigned char *));
+ unsigned char **old_regend
+ = (unsigned char **) REGEX_ALLOCATE (num_internal_regs * sizeof (unsigned char *));
/* The is_active field of reg_info helps us keep track of which (possibly
nested) subexpressions we are currently in. The matched_something
@@ -1972,7 +3711,8 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
subexpression. These two fields get reset each time through any
loop their register is in. */
- struct register_info reg_info[RE_NREGS];
+ struct register_info *reg_info = (struct register_info *)
+ REGEX_ALLOCATE (num_internal_regs * sizeof (struct register_info));
/* The following record the register info as found in the above
@@ -1981,36 +3721,92 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
turn happens only if we have not yet matched the entire string. */
unsigned best_regs_set = 0;
- unsigned char *best_regstart[RE_NREGS];
- unsigned char *best_regend[RE_NREGS];
- /* Initialize the stack. */
-#ifdef NO_ALLOCA
- stackb = (unsigned char **) malloc (MAX_NUM_FAILURE_ITEMS * NFAILURES * sizeof (char *));
-#else
- stackb = initial_stack;
-#endif
- stackp = stackb;
- stacke = &stackb[MAX_NUM_FAILURE_ITEMS * NFAILURES];
+ unsigned char **best_regstart
+ = (unsigned char **) REGEX_ALLOCATE (num_internal_regs * sizeof (unsigned char *));
+
+ unsigned char **best_regend
+ = (unsigned char **) REGEX_ALLOCATE (num_internal_regs * sizeof (unsigned char *));
-#ifdef DEBUG_REGEX
- fprintf (stderr, "Entering re_match_2(%s%s)\n", string1_arg, string2_arg);
+ unsigned current_reg = 0;
+
+ /* End of declarations. */
+
+
+ if (!INIT_FAILURE_STACK (failure_stack))
+ return -2;
+
+ if (!(regstart && regend && old_regstart && old_regend && reg_info
+ && best_regstart && best_regend))
+ {
+#ifdef REGEX_MALLOC
+ FREE_VARIABLES;
#endif
+ return -2;
+ }
+ /* The starting position is bogus. */
+ if (pos < 0 || pos > size1 + size2)
+ {
+#ifdef REGEX_MALLOC
+ FREE_VARIABLES;
+#endif
+ return -1;
+ }
+
+
/* Initialize subexpression text positions to -1 to mark ones that no
\( or ( and \) or ) has been seen for. Also set all registers to
- inactive and mark them as not having matched anything or ever
- failed. */
- for (mcnt = 0; mcnt < RE_NREGS; mcnt++)
+ inactive and mark them as not having any inner groups, able to
+ match the empty string, matched anything so far, or ever failed. */
+
+ for (mcnt = 0; mcnt < num_internal_regs; mcnt++)
{
- regstart[mcnt] = regend[mcnt] = (unsigned char *) -1;
+ regstart[mcnt] = regend[mcnt]
+ = old_regstart[mcnt] = old_regend[mcnt] = (unsigned char *) -1;
+
+ if (!INIT_BITS_LIST (INNER_GROUPS (reg_info[mcnt])))
+ {
+#ifdef REGEX_MALLOC
+ FREE_VARIABLES;
+#endif
+ return -2;
+ }
+
+ CAN_MATCH_NOTHING (reg_info[mcnt]) = -1; /* I.e., unset. */
+ /* The bit fields. */
IS_ACTIVE (reg_info[mcnt]) = 0;
MATCHED_SOMETHING (reg_info[mcnt]) = 0;
+ EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
}
- if (regs)
- for (mcnt = 0; mcnt < RE_NREGS; mcnt++)
- regs->start[mcnt] = regs->end[mcnt] = -1;
+ IS_ACTIVE (reg_info[0]) = 1;
+
+
+ if (regs && num_regs_wanted > 0)
+ {
+ if (bufp->syntax & RE_ALLOCATE_REGISTERS)
+ {
+ regs->num_regs = num_regs_wanted;
+ regs->start = (int *) malloc (regs->num_regs * sizeof (int));
+
+ if (regs->start == NULL)
+ return -2;
+
+ regs->end = (int *) malloc (regs->num_regs * sizeof (int));
+
+ if (regs->end == NULL)
+ return -2;
+ }
+
+ for (mcnt = 0; mcnt < regs->num_regs; mcnt++)
+ {
+ regs->start[mcnt] = -1;
+ regs->end[mcnt] = -1;
+ }
+ }
+
+
/* Set up pointers to ends of strings.
Don't allow the second string to be empty unless both are empty. */
@@ -2024,16 +3820,17 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
end1 = string1 + size1;
end2 = string2 + size2;
+
/* Compute where to stop matching, within the two strings. */
- if (mstop <= size1)
+ if (stop <= size1)
{
- end_match_1 = string1 + mstop;
+ end_match_1 = string1 + stop;
end_match_2 = string2;
}
else
{
end_match_1 = end1;
- end_match_2 = string2 + mstop - size1;
+ end_match_2 = string2 + stop - size1;
}
/* `p' scans through the pattern as `d' scans through the data. `dend'
@@ -2041,12 +3838,18 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
advanced into the following input string whenever necessary, but
this happens before fetching; therefore, at the beginning of the
loop, `d' can be pointing at the end of a string, but it cannot
- equal string2. */
+ equal `string2'. */
if (size1 != 0 && pos <= size1)
- d = string1 + pos, dend = end_match_1;
+ {
+ d = string1 + pos;
+ dend = end_match_1;
+ }
else
- d = string2 + pos - size1, dend = end_match_2;
+ {
+ d = string2 + pos - size1;
+ dend = end_match_2;
+ }
/* This loops over pattern commands. It exits by returning from the
@@ -2055,12 +3858,6 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
while (1)
{
-#ifdef DEBUG_REGEX
- fprintf (stderr,
- "regex loop(%d): matching 0x%02d\n",
- p - (unsigned char *) pbufp->buffer,
- *p);
-#endif
is_a_jump_n = 0;
/* End of pattern means we might have succeeded. */
if (p == pend)
@@ -2068,7 +3865,7 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
/* If not end of string, try backtracking. Otherwise done. */
if (d != end_match_2)
{
- if (stackp != stackb)
+ if (!FAILURE_STACK_EMPTY)
{
/* More failure points to try. */
@@ -2084,7 +3881,7 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
best_regs_set = 1;
best_regend[0] = d; /* Never use regstart[0]. */
- for (mcnt = 1; mcnt < RE_NREGS; mcnt++)
+ for (mcnt = 1; mcnt < num_internal_regs; mcnt++)
{
best_regstart[mcnt] = regstart[mcnt];
best_regend[mcnt] = regend[mcnt];
@@ -2099,46 +3896,54 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
/* Restore best match. */
d = best_regend[0];
- for (mcnt = 0; mcnt < RE_NREGS; mcnt++)
+ if (d >= string1 && d <= end1)
+ dend = end_match_1;
+
+ for (mcnt = 0; mcnt < num_internal_regs; mcnt++)
{
regstart[mcnt] = best_regstart[mcnt];
regend[mcnt] = best_regend[mcnt];
}
}
- }
+ } /* if (d != end_match_2) */
/* If caller wants register contents data back, convert it
to indices. */
- if (regs)
+ if (regs && regs->num_regs > 0)
{
regs->start[0] = pos;
- if (MATCHING_IN_FIRST_STRING)
- regs->end[0] = d - string1;
- else
- regs->end[0] = d - string2 + size1;
- for (mcnt = 1; mcnt < RE_NREGS; mcnt++)
+
+ regs->end[0] = MATCHING_IN_FIRST_STRING
+ ? d - string1
+ : d - string2 + size1;
+
+ for (mcnt = 1; mcnt < regs->num_regs; mcnt++)
{
- if (regend[mcnt] == (unsigned char *) -1)
+ if (mcnt >= num_internal_regs
+ || regstart[mcnt] == (unsigned char *) -1
+ || regend[mcnt] == (unsigned char *) -1)
{
regs->start[mcnt] = -1;
regs->end[mcnt] = -1;
continue;
}
- if (IS_IN_FIRST_STRING (regstart[mcnt]))
- regs->start[mcnt] = regstart[mcnt] - string1;
- else
- regs->start[mcnt] = regstart[mcnt] - string2 + size1;
+
+ regs->start[mcnt] = IS_IN_FIRST_STRING (regstart[mcnt])
+ ? regstart[mcnt] - string1
+ : regstart[mcnt] - string2 + size1;
- if (IS_IN_FIRST_STRING (regend[mcnt]))
- regs->end[mcnt] = regend[mcnt] - string1;
- else
- regs->end[mcnt] = regend[mcnt] - string2 + size1;
+ regs->end[mcnt] = IS_IN_FIRST_STRING (regend[mcnt])
+ ? regend[mcnt] - string1
+ : regend[mcnt] - string2 + size1;
}
}
- FREE_AND_RETURN(stackb,
- (d - pos - (MATCHING_IN_FIRST_STRING ?
- string1 :
- string2 - size1)));
+
+#ifdef REGEX_MALLOC
+ FREE_VARIABLES;
+#endif
+ return d - pos - (MATCHING_IN_FIRST_STRING
+ ? string1
+ : string2 - size1);
}
/* Otherwise match next pattern command. */
@@ -2150,51 +3955,135 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
{
/* \( [or `(', as appropriate] is represented by start_memory,
- \) by stop_memory. Both of those commands are followed by
- a register number in the next byte. The text matched
- within the \( and \) is recorded under that number. */
- case start_memory:
+ \) by stop_memory. Both of those commands are followed by a
+ register number in the next byte. The text matched within
+ the \( and \) is recorded (in the internal registers data
+ structure) under that number. */
+
+ case start_memory:
+ /* Find out if this group can match the empty string. */
+ p1 = p; /* To send to group_can_match_nothing. */
+
+ if (CAN_MATCH_NOTHING (reg_info[*p]) == -1)
+ CAN_MATCH_NOTHING (reg_info[*p])
+ = group_can_match_nothing (&p1, pend, reg_info);
+
+ /* Save the position in the string where we were the last time
+ we were at this open-group operator in case the group is
+ operated upon by a repetition operator, e.g., with `(a*)*b'
+ against `ab'; then we want to ignore where we are now in
+ the string in case this attempt to match fails. */
+
+ old_regstart[*p] = CAN_MATCH_NOTHING (reg_info[*p])
+ ? ((regstart[*p] == (unsigned char *) -1)
+ ? d : regstart[*p])
+ : regstart[*p];
regstart[*p] = d;
+
IS_ACTIVE (reg_info[*p]) = 1;
MATCHED_SOMETHING (reg_info[*p]) = 0;
p++;
break;
case stop_memory:
+ /* Save the position we were in the string the last time we
+ were at this close-group operator in case the group is
+ operated upon by a repetition operator, e.g., with
+ `((a*)*(b*)*)*' against `aba'; then we want to ignore where
+ we are now in the string in case this attempt to match
+ fails. */
+
+ old_regend[*p] = CAN_MATCH_NOTHING (reg_info[*p])
+ ? ((regend[*p] == (unsigned char *) -1)
+ ? d : regend[*p])
+ : regend[*p];
regend[*p] = d;
IS_ACTIVE (reg_info[*p]) = 0;
-
- /* If just failed to match something this time around with a sub-
- expression that's in a loop, try to force exit from the loop. */
- if ((! MATCHED_SOMETHING (reg_info[*p])
- || (enum regexpcode) p[-3] == start_memory)
+
+ /* Record that this group is inside of all currently active
+ groups; makes no sense for group 1. */
+ if (*p != 1)
+ NOTE_INNER_GROUP (*p);
+
+
+ /* If just failed to match something this time around with a
+ group that's operated on by a repetition operator, try to
+ force exit from the ``loop,'' and restore the register
+ information for this group that we had before trying this
+ last match. */
+
+ if ((!MATCHED_SOMETHING (reg_info[*p])
+ || (enum regexpcode) p[-3] == start_memory)
&& (p + 1) != pend)
{
- register unsigned char *p2 = p + 1;
+ p1 = p + 1;
mcnt = 0;
- switch (*p2++)
+ switch ((enum regexcode) *p1++)
{
- case jump_n:
+ case no_pop_jump_n:
is_a_jump_n = 1;
- case finalize_jump:
- case maybe_finalize_jump:
- case jump:
+ case pop_failure_jump:
+ case maybe_pop_jump:
+ case no_pop_jump:
case dummy_failure_jump:
- EXTRACT_NUMBER_AND_INCR (mcnt, p2);
+ extract_number_and_incr (&mcnt, &p1);
if (is_a_jump_n)
- p2 += 2;
+ p1 += 2;
break;
}
- p2 += mcnt;
+ p1 += mcnt;
/* If the next operation is a jump backwards in the pattern
- to an on_failure_jump, exit from the loop by forcing a
- failure after pushing on the stack the on_failure_jump's
- jump in the pattern, and d. */
- if (mcnt < 0 && (enum regexpcode) *p2++ == on_failure_jump)
+ to an on_failure_jump right before the start_memory
+ corresponding to this stop_memory, exit from the loop
+ by forcing a failure after pushing on the stack the
+ on_failure_jump's jump in the pattern, and d. */
+
+ if (mcnt < 0 && (enum regexpcode) *p1 == on_failure_jump
+ && (enum regexpcode) p1[3] == start_memory && p1[4] == *p)
{
- EXTRACT_NUMBER_AND_INCR (mcnt, p2);
- PUSH_FAILURE_POINT (p2 + mcnt, d);
+ /* If this group ever matched anything, then
+ restore what its registers were before trying
+ this last failed match, e.g., with `(a*)*b' against
+ `ab' for regstart[1], and, e.g., with `((a*)*(b*)*)*'
+ against `aba' for regend[3].
+
+ Restore the registers for inner groups, too, e.g.,
+ for `((a*)(b*))*' against `aba' (register 2 gets
+ trashed). */
+
+ if (EVER_MATCHED_SOMETHING (reg_info[*p]))
+ {
+ unsigned this_reg;
+ unsigned bits_mask;
+
+ EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
+
+ /* Restore this group's registers. */
+
+ regstart[*p] = old_regstart[*p];
+ regend[*p] = old_regend[*p];
+
+ /* Restore the inner groups' (if any) registers. */
+
+ for (this_reg = 0;
+ this_reg < INNER_GROUPS (reg_info[*p]).size;
+ this_reg++)
+ {
+ if (get_bit (INNER_GROUPS (reg_info[*p]), this_reg))
+ {
+ regstart[this_reg] = old_regstart[this_reg];
+
+ if ((int)old_regend[this_reg]
+ >= (int)regstart[this_reg])
+ regend[this_reg] = old_regend[this_reg];
+ }
+ }
+ }
+ p1++;
+ extract_number_and_incr (&mcnt, &p1);
+ PUSH_FAILURE_POINT (p1 + mcnt, d, failure_stack, -2);
+
goto fail;
}
}
@@ -2205,10 +4094,16 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
followed by the numeric value of <digit> as the register number. */
case duplicate:
{
- int regno = *p++; /* Get which register to match against */
register unsigned char *d2, *dend2;
-
- /* Where in input to try to start matching. */
+ int regno = *p++; /* Get which register to match against. */
+
+ /* Can't back reference a group which we've never matched. */
+ if ((regstart[regno] == (unsigned char *) -1
+ || regend[regno] == (unsigned char *) -1)
+ && ! bufp->can_be_null)
+ goto really_fail;
+
+ /* Where in input to try to start matching. */
d2 = regstart[regno];
/* Where to stop matching; if both the place to start and
@@ -2227,7 +4122,10 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
{
if (dend2 == end_match_2) break;
if (dend2 == regend[regno]) break;
- d2 = string2, dend2 = regend[regno]; /* end of string1 => advance to string2. */
+
+ /* end of string1 => advance to string2. */
+ d2 = string2;
+ dend2 = regend[regno];
}
/* At end of register contents => success */
if (d2 == dend2) break;
@@ -2246,8 +4144,8 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
/* Compare that many; failure if mismatch, else move
past them. */
if (translate
- ? memcmp_translate (d, d2, mcnt, translate)
- : memcmp ((char *)d, (char *)d2, mcnt))
+ ? bcmp_translate (d, d2, mcnt, translate)
+ : bcmp (d, d2, mcnt))
goto fail;
d += mcnt, d2 += mcnt;
}
@@ -2256,12 +4154,14 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
case anychar:
PREFETCH; /* Fetch a data character. */
- /* Match anything but a newline, maybe even a null. */
- if ((translate ? translate[*d] : *d) == '\n'
- || ((obscure_syntax & RE_DOT_NOT_NULL)
+ /* Match anything but possibly a newline or a null. */
+ if ((!(bufp->syntax & RE_DOT_NEWLINE)
+ && (translate ? translate[*d] : *d) == '\n')
+ || ((bufp->syntax & RE_DOT_NOT_NULL)
&& (translate ? translate[*d] : *d) == '\000'))
goto fail;
- SET_REGS_MATCHED;
+
+ SET_REGS_MATCHED;
d++;
break;
@@ -2275,10 +4175,7 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
PREFETCH; /* Fetch a data character. */
- if (translate)
- c = translate[*d];
- else
- c = *d;
+ c = translate ? translate[*d] : *d;
if (c < *p * BYTEWIDTH
&& p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
@@ -2293,66 +4190,105 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
}
case begline:
+ if (bufp->not_bol == 1)
+ goto fail;
+
+ if (d && (*d == '\n' || d[-1] == '\n'))
+ {
+ if (*d == '\n')
+ d++;
+
+ if (bufp->syntax & RE_NO_ANCHOR_AT_NEWLINE)
+ goto fail;
+ else
+ break;
+ }
+
if ((size1 != 0 && d == string1)
|| (size1 == 0 && size2 != 0 && d == string2)
- || (d && d[-1] == '\n')
|| (size1 == 0 && size2 == 0))
break;
else
goto fail;
case endline:
+ if (bufp->not_eol == 1)
+ goto fail;
+
if (d == end2
- || (d == end1 ? (size2 == 0 || *string2 == '\n') : *d == '\n'))
+ || (d == end1 && size2 == 0))
break;
- goto fail;
- /* `or' constructs are handled by starting each alternative with
- an on_failure_jump that points to the start of the next
- alternative. Each alternative except the last ends with a
- jump to the joining point. (Actually, each jump except for
- the last one really jumps to the following jump, because
- tensioning the jumps is a hassle.) */
+ if (*d == '\n' || (d == end1 && *string2 == '\n'))
+ {
+ PREFETCH;
+
+ if (*d == '\n')
+ d++;
+
+ if (bufp->syntax & RE_NO_ANCHOR_AT_NEWLINE)
+ goto fail;
+ else
+ break;
+ }
+ goto fail;
- /* The start of a stupid repeat has an on_failure_jump that points
- past the end of the repeat text. This makes a failure point so
- that on failure to match a repetition, matching restarts past
- as many repetitions have been found with no way to fail and
- look for another one. */
+ /* Uses of on_failure_jump:
+
+ Each alternative starts with an on_failure_jump that points
+ to the beginning of the next alternative. Each alternative
+ except the last ends with a jump that in effect jumps past
+ the rest of the alternatives. (They really jump to the
+ ending jump of the following alternative, because tensioning
+ these jumps is a hassle.)
- /* A smart repeat is similar but loops back to the on_failure_jump
- so that each repetition makes another failure point. */
+ Repeats start with an on_failure_jump that points past both
+ the repetition text and the following jump or
+ pop_failure_jump back to this on_failure_jump. */
case on_failure_jump:
on_failure:
- EXTRACT_NUMBER_AND_INCR (mcnt, p);
- PUSH_FAILURE_POINT (p + mcnt, d);
+ extract_number_and_incr (&mcnt, &p);
+ PUSH_FAILURE_POINT (p + mcnt, d, failure_stack, -2);
+
break;
- /* The end of a smart repeat has a maybe_finalize_jump back.
- Change it either to a finalize_jump or an ordinary jump. */
- case maybe_finalize_jump:
- EXTRACT_NUMBER_AND_INCR (mcnt, p);
+
+ /* A smart repeat ends with a maybe_pop_jump.
+ We change it either to a pop_failure_jump or a no_pop_jump. */
+
+ case maybe_pop_jump:
+ extract_number_and_incr (&mcnt, &p);
{
register unsigned char *p2 = p;
- /* Compare what follows with the beginning of the repeat.
- If we can establish that there is nothing that they would
- both match, we can change to finalize_jump. */
+
+ /* Compare the beginning of the repeat with what follows
+ its end in the pattern. If we can establish that there
+ is nothing that they would both match (unlike, e.g.,
+ `a*a', where the repeat and what follows it both match
+ `a'), then we can change to pop_failure_jump, because
+ we'll never have to backtrack. */
+
+ /* Skip over parentheses. */
while (p2 + 1 != pend
&& (*p2 == (unsigned char) stop_memory
|| *p2 == (unsigned char) start_memory))
- p2 += 2; /* Skip over reg number. */
- if (p2 == pend)
- p[-3] = (unsigned char) finalize_jump;
- else if (*p2 == (unsigned char) exactn
+ p2 += 2; /* Skip over reg number, too. */
+
+ if (p2 == pend)
+ p[-3] = (unsigned char) pop_failure_jump;
+ else if (*p2 == (unsigned char) exactn
|| *p2 == (unsigned char) endline)
{
register int c = *p2 == (unsigned char) endline ? '\n' : p2[2];
register unsigned char *p1 = p + mcnt;
- /* p1[0] ... p1[2] are an on_failure_jump.
- Examine what follows that. */
- if (p1[3] == (unsigned char) exactn && p1[5] != c)
- p[-3] = (unsigned char) finalize_jump;
+
+ /* p1[0] ... p1[2] are the on_failure_jump corresponding
+ to the maybe_pop_jump of this case. Examine what
+ follows it. */
+
+ if (p1[3] == (unsigned char) exactn && p1[5] != c)
+ p[-3] = (unsigned char) pop_failure_jump;
else if (p1[3] == (unsigned char) charset
|| p1[3] == (unsigned char) charset_not)
{
@@ -2360,53 +4296,83 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
if (c < p1[4] * BYTEWIDTH
&& p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
not = !not;
- /* `not' is 1 if c would match. */
- /* That means it is not safe to finalize. */
+ /* `not' is equal to 1 if c would match, which means
+ that we can't change to pop_failure_jump. */
if (!not)
- p[-3] = (unsigned char) finalize_jump;
+ p[-3] = (unsigned char) pop_failure_jump;
}
}
}
p -= 2; /* Point at relative address again. */
- if (p[-1] != (unsigned char) finalize_jump)
+ if (p[-1] != (unsigned char) pop_failure_jump)
{
- p[-1] = (unsigned char) jump;
- goto nofinalize;
+ p[-1] = (unsigned char) no_pop_jump;
+ goto no_pop;
}
/* Note fall through. */
- /* The end of a stupid repeat has a finalize_jump back to the
- start, where another failure point will be made which will
- point to after all the repetitions found so far. */
+ /* The end of a simple repeat has a pop_failure_jump back to
+ its matching on_failure_jump, where the latter will push a
+ failure point. The pop_failure_jump takes off failure
+ points put on by this pop_failure_jump's matching
+ on_failure_jump; we got through the pattern to here from the
+ matching on_failure_jump, so didn't fail. Also remove the
+ register information put on by the matching on_failure_jump. */
+
+ case pop_failure_jump:
+ pop:
+ pop_failure_point (&failure_stack);
+ /* Note fall through. */
+
+ /* Jump without taking off any failure points. */
- /* Take off failure points put on by matching on_failure_jump
- because didn't fail. Also remove the register information
- put on by the on_failure_jump. */
- case finalize_jump:
- POP_FAILURE_POINT ();
- /* Note fall through. */
-
- /* Jump without taking off any failure points. */
- case jump:
- nofinalize:
- EXTRACT_NUMBER_AND_INCR (mcnt, p);
- p += mcnt;
+ case no_pop_jump:
+ no_pop:
+ extract_number_and_incr (&mcnt, &p); /* Get the amount to jump. */
+ p += mcnt; /* Do the jump. */
break;
+
+ /* If the last alternative didn't match anything and empty
+ alternatives aren't allowed, then don't skip over the next
+ one. */
+
+ case jump_past_next_alt:
+ {
+ int this_reg; /* Counting down. */
+
+ /* The current register is the innermost (the one with the
+ highest number) active one. */
+
+ for (this_reg = num_internal_regs - 1;
+ this_reg >= 0; this_reg--)
+ if (IS_ACTIVE (reg_info[this_reg]))
+ break;
+
+ if (!(bufp->syntax & RE_NO_EMPTY_ALTS)
+ || MATCHED_SOMETHING (reg_info[this_reg]))
+ goto no_pop;
+
+ p += 2; /* Skip past the jump's number. */
+ break;
+ }
+
case dummy_failure_jump:
/* Normally, the on_failure_jump pushes a failure point, which
- then gets popped at finalize_jump. We will end up at
- finalize_jump, also, and with a pattern of, say, `a+', we
+ then gets popped at pop_failure_jump. We will end up at
+ pop_failure_jump, also, and with a pattern of, say, `a+', we
are skipping over the on_failure_jump, so we have to push
- something meaningless for finalize_jump to pop. */
- PUSH_FAILURE_POINT (0, 0);
- goto nofinalize;
+ something meaningless for pop_failure_jump to pop. */
+
+ PUSH_FAILURE_POINT (0, 0, failure_stack, -2);
+
+ goto no_pop;
/* Have to succeed matching what follows at least n times. Then
just handle like an on_failure_jump. */
case succeed_n:
- EXTRACT_NUMBER (mcnt, p + 2);
+ mcnt = extract_number (p + 2);
/* Originally, this is how many times we HAVE to succeed. */
if (mcnt)
{
@@ -2416,8 +4382,8 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
}
else if (mcnt == 0)
{
- p[2] = unused;
- p[3] = unused;
+ p[2] = (char) no_op;
+ p[3] = (char) no_op;
goto on_failure;
}
else
@@ -2427,15 +4393,14 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
}
break;
- case jump_n:
- EXTRACT_NUMBER (mcnt, p + 2);
+ case no_pop_jump_n:
+ mcnt = extract_number (p + 2);
/* Originally, this is how many times we CAN jump. */
if (mcnt)
{
mcnt--;
STORE_NUMBER(p + 2, mcnt);
- goto nofinalize; /* Do the jump without taking off
- any failure points. */
+ goto no_pop;
}
/* If don't have to jump any more, skip over the rest of command. */
else
@@ -2446,16 +4411,16 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
{
register unsigned char *p1;
- EXTRACT_NUMBER_AND_INCR (mcnt, p);
+ extract_number_and_incr (&mcnt, &p);
p1 = p + mcnt;
- EXTRACT_NUMBER_AND_INCR (mcnt, p);
+ extract_number_and_incr (&mcnt, &p);
STORE_NUMBER (p1, mcnt);
break;
}
/* Ignore these. Used to ignore the n of succeed_n's which
currently have n == 0. */
- case unused:
+ case no_op:
break;
case wordbound:
@@ -2469,32 +4434,56 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
break;
case wordbeg:
- if (IS_A_LETTER (d) && (!IS_A_LETTER (d - 1) || AT_STRINGS_BEG))
+ /* Have to check if AT_STRINGS_BEG before looking at d - 1. */
+ if (IS_A_LETTER (d) && (AT_STRINGS_BEG || !IS_A_LETTER (d - 1)))
break;
goto fail;
case wordend:
/* Have to check if AT_STRINGS_BEG before looking at d - 1. */
- if (!AT_STRINGS_BEG && IS_A_LETTER (d - 1)
+ if (!AT_STRINGS_BEG && IS_A_LETTER (d - 1)
&& (!IS_A_LETTER (d) || AT_STRINGS_END))
break;
goto fail;
#ifdef emacs
- case before_dot:
- if (PTR_CHAR_POS (d) >= point)
+#ifdef emacs19
+ case before_dot:
+ if (PTR_CHAR_POS (d) >= point)
+ goto fail;
+ break;
+
+ case at_dot:
+ if (PTR_CHAR_POS (d) != point)
+ goto fail;
+ break;
+
+ case after_dot:
+ if (PTR_CHAR_POS (d) <= point)
+ goto fail;
+ break;
+#else /* not emacs19 */
+ case before_dot:
+ if (((d - string2 <= (unsigned) size2)
+ ? d - bf_p2 : d - bf_p1)
+ <= point)
goto fail;
break;
case at_dot:
- if (PTR_CHAR_POS (d) != point)
+ if (((d - string2 <= (unsigned) size2)
+ ? d - bf_p2 : d - bf_p1)
+ == point)
goto fail;
break;
case after_dot:
- if (PTR_CHAR_POS (d) <= point)
+ if (((d - string2 <= (unsigned) size2)
+ ? d - bf_p2 : d - bf_p1)
+ >= point)
goto fail;
break;
+#endif /* not emacs19 */
case wordchar:
mcnt = (int) Sword;
@@ -2579,28 +4568,64 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
/* Jump here if any matching operation fails. */
fail:
- if (stackp != stackb)
+ if (!FAILURE_STACK_EMPTY)
/* A restart point is known. Restart there and pop it. */
{
- short last_used_reg, this_reg;
-
- /* If this failure point is from a dummy_failure_point, just
- skip it. */
- if (!stackp[-2])
+ short highest_used_reg, this_reg;
+ boolean is_a_jump_n = false;
+
+ /* If this failure point is from a dummy_failure_jump,
+ just skip it. */
+
+ if (!failure_stack.stack[failure_stack.avail - 2])
{
- POP_FAILURE_POINT ();
+ pop_failure_point (&failure_stack);
goto fail;
}
- d = *--stackp;
- p = *--stackp;
+ /* Among other things, undo the last failure point push. */
+
+ d = failure_stack.stack[--failure_stack.avail];
+ p = failure_stack.stack[--failure_stack.avail];
+
+
+ /* If failed to a backwards jump that's part of a repetition
+ loop, need to pop this failure point and use the next one. */
+
+ switch ((enum regexpcode) *p)
+ {
+ case no_pop_jump_n:
+ is_a_jump_n = true;
+ case maybe_pop_jump:
+ case pop_failure_jump:
+ case no_pop_jump:
+ p1 = p + 1;
+ extract_number_and_incr (&mcnt, &p1);
+ p1 += mcnt;
+
+ if ((is_a_jump_n && *p1 == succeed_n)
+ || (!is_a_jump_n && *p1 == on_failure_jump))
+ {
+ /* Put p and d back on the stack again... */
+ failure_stack.avail += 2;
+
+ /* ...and pop the whole failure point. */
+ pop_failure_point (&failure_stack);
+ goto fail;
+ }
+ break;
+ }
+
if (d >= string1 && d <= end1)
dend = end_match_1;
+
/* Restore register info. */
- last_used_reg = (short) *--stackp;
+ highest_used_reg
+ = (short) failure_stack.stack[--failure_stack.avail];
/* Make the ones that weren't saved -1 or 0 again. */
- for (this_reg = RE_NREGS - 1; this_reg > last_used_reg; this_reg--)
+ for (this_reg = num_internal_regs - 1; this_reg > highest_used_reg;
+ this_reg--)
{
regend[this_reg] = (unsigned char *) -1;
regstart[this_reg] = (unsigned char *) -1;
@@ -2611,24 +4636,342 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
/* And restore the rest from the stack. */
for ( ; this_reg > 0; this_reg--)
{
- reg_info[this_reg] = *(struct register_info *) *--stackp;
- regend[this_reg] = *--stackp;
- regstart[this_reg] = *--stackp;
+ reg_info[this_reg] = *(struct register_info *)
+ failure_stack.stack[--failure_stack.avail];
+
+ regend[this_reg]
+ = failure_stack.stack[--failure_stack.avail];
+
+ regstart[this_reg]
+ = failure_stack.stack[--failure_stack.avail];
}
- }
+ }
else
break; /* Matching at this starting point really fails. */
- }
+ } /* while (1) */
+ really_fail:
if (best_regs_set)
goto restore_best_regs;
- FREE_AND_RETURN(stackb,(-1)); /* Failure to match. */
+#ifdef REGEX_MALLOC
+ FREE_VARIABLES;
+#endif
+ return -1; /* Failure to match. */
+}
+
+
+
+
+/* Subroutine definitions for re_match_2. */
+
+
+
+/* Failure stack stuff. */
+
+/* Pops what PUSH_FAILURE_POINT pushes. */
+
+static void
+pop_failure_point(failure_stack_ptr)
+ failure_stack_type *failure_stack_ptr;
+{
+ int temp;
+
+ if (FAILURE_STACK_PTR_EMPTY)
+ {
+ printf ("Tried to pop empty failure point in re_match_2.\n");
+ exit (1);
+ }
+
+ /* Remove failure points and point to how many regs pushed. */
+ else
+ {
+ if (failure_stack_ptr->avail < 3)
+ {
+ printf ("Aren't enough items to pop on re_match_2 failure stack: \
+there's only %d on it.\n", failure_stack_ptr->avail);
+ exit (1);
+ }
+ failure_stack_ptr->avail -= 3;
+ temp = (int) failure_stack_ptr->stack[failure_stack_ptr->avail];
+ temp *= NUM_REG_ITEMS; /* How much to take off the stack. */
+
+ if (failure_stack_ptr->avail < temp)
+ {
+ printf ("Can't pop %d items off re_match_2 failure stack: \
+there's only %d on it.\n", temp, failure_stack_ptr->avail);
+ exit (1);
+ }
+ failure_stack_ptr->avail -= temp; /* Remove the register info. */
+ }
+}
+
+
+/* Other things. */
+
+static boolean common_op_can_match_nothing ();
+static boolean alternative_can_match_nothing ();
+
+
+/* We are given P pointing to a register number after a start_memory.
+
+ Return true if the pattern up to the corresponding stop_memory can
+ match the empty string, and false otherwise.
+
+ If we find the matching stop_memory, sets P to point to one past its number.
+ Otherwise, sets P to an undefined byte less than or equal to END.
+
+ We don't handle duplicates properly (yet). */
+
+static boolean
+group_can_match_nothing (p, end, reg_info)
+ unsigned char **p, *end;
+ struct register_info *reg_info;
+{
+ int mcnt;
+ unsigned char *p1 = *p + 1; /* Point to after this register number. */
+
+ while (p1 < end)
+ {
+ /* Skip over opcodes that can match nothing, and return true or
+ false, as appropriate, when we get to one that can't, or to the
+ matching stop_memory. */
+
+ switch ((enum regexpcode) *p1)
+ {
+ /* Could be either a loop or a series of alternatives. */
+ case on_failure_jump:
+ p1++;
+ extract_number_and_incr (&mcnt, &p1);
+
+ /* If the next operation is not a jump backwards in the
+ pattern. */
+
+ if (mcnt >= 0)
+ {
+ /* Go through the on_failure_jumps of the alternatives,
+ seeing if any of the alternatives cannot match nothing.
+ The last alternative starts with only a no_pop_jump,
+ whereas the rest start with on_failure_jump and end
+ with a no_pop_jump, e.g., here is the pattern for `a|b|c':
+
+ /on_failure_jump/0/6/exactn/1/a/jump_past_next_alt/0/6
+ /on_failure_jump/0/6/exactn/1/b/jump_past_next_alt/0/3
+ /exactn/1/c
+
+ So, we have to first go through the first (n-1)
+ alternatives and then deal with the last one separately. */
+
+
+ /* Deal with the first (n-1) alternatives, which start
+ with an on_failure_jump (see above) that jumps to right
+ past a jump_past_next_alt. */
+
+ while ((enum regexpcode) p1[mcnt-3] == jump_past_next_alt)
+ {
+ /* MCNT holds how many bytes long the alternative
+ is, including the ending `jump_past_next_alt' and its number. */
+
+ if (!alternative_can_match_nothing (p1, p1 + mcnt - 3,
+ reg_info))
+ return false;
+
+ /* Move to right after this alternative, including the
+ jump_past_next_alt. */
+
+ p1 += mcnt;
+
+ /* Break if it's the beginning of an n-th alternative
+ that doesn't begin with an on_failure_jump. */
+
+ if ((enum regexpcode) *p1 != on_failure_jump)
+ break;
+
+ /* Still have to check that it's not an n-th
+ alternative that starts with an on_failure_jump. */
+ p1++;
+ extract_number_and_incr (&mcnt, &p1);
+ if ((enum regexpcode) p1[mcnt-3] != jump_past_next_alt)
+ {
+ /* Get to the beginning of the n-th alternative. */
+ p1 -= 3;
+ break;
+ }
+ }
+
+ /* Deal with the last alternative: go back and get number
+ of the jump_past_next_alt just before it. MCNT contains how
+ many bytes long the alternative is. */
+
+ mcnt = extract_number (p1 - 2);
+
+ if (!alternative_can_match_nothing (p1, p1 + mcnt, reg_info))
+ return false;
+
+ p1 += mcnt; /* Get past the n-th alternative. */
+
+ } /* if mcnt >= 0 */
+
+ break;
+
+ case stop_memory:
+ if (p1[1] == **p)
+ {
+ *p = p1 + 2;
+ return true;
+ }
+ else
+ {
+ printf ("Error: encountered an unmatched (%d) stop_memory in \
+group_can_match_nothing.\n", **p);
+ exit (1);
+ }
+ break;
+
+ default:
+ if (!common_op_can_match_nothing (&p1, end, reg_info))
+ return false;
+ }
+ } /* While p1 < end. */
+
+ return false;
+}
+
+
+/* Similar to group_can_match_nothing, but doesn't deal with alternatives:
+ It expects P to be the first byte of a single alternative and END one
+ byte past the last. The alternative can contain groups. */
+
+
+static boolean
+alternative_can_match_nothing (p, end, reg_info)
+ unsigned char *p, *end;
+ struct register_info *reg_info;
+{
+ int mcnt;
+ unsigned char *p1 = p;
+
+ while (p1 < end)
+ {
+ /* Skip over opcodes that can match nothing, and break when we get
+ to one that can't. */
+
+ switch ((enum regexpcode) *p1)
+ {
+ /* It's a loop. */
+ case on_failure_jump:
+ p1++;
+ extract_number_and_incr (&mcnt, &p1);
+ p1 += mcnt;
+ break;
+
+ default:
+ if (!common_op_can_match_nothing (&p1, end, reg_info))
+ return false;
+ }
+ } /* While not at the end of the alternative. */
+
+ return true;
+}
+
+
+/* Deals with the ops common to group_can_match_nothing and
+ alternative_can_match_nothing.
+
+ Sets P to one after the op and its arguments, if any. */
+
+static boolean
+common_op_can_match_nothing (p, end, reg_info)
+ unsigned char **p, *end;
+ struct register_info *reg_info;
+{
+ int mcnt;
+ unsigned char *p1 = *p;
+ boolean ret;
+ int reg_no;
+
+ switch ((enum regexp1code) *p1++)
+ {
+ case no_op:
+ case begline:
+ case endline:
+ case endline_in_repeat:
+ case endline_before_newline:
+ break;
+
+ case start_memory:
+ reg_no = *p1;
+ ret = group_can_match_nothing (&p1, end, reg_info);
+
+ /* Have to set this here in case we're checking a group which
+ contains a group and a back reference to it. */
+
+ if (CAN_MATCH_NOTHING (reg_info[reg_no]) == -1)
+ CAN_MATCH_NOTHING (reg_info[reg_no]) = ret;
+
+ if (!ret)
+ return false;
+ break;
+
+ /* If this is an optimized succeed_n for zero times, make the jump. */
+ case no_pop_jump:
+ extract_number_and_incr (&mcnt, &p1);
+
+ if (mcnt >= 0)
+ p1 += mcnt;
+ else
+ return false;
+ break;
+
+ case succeed_n:
+ /* Get to the number of times to succeed. */
+ p1 += 2;
+ extract_number_and_incr (&mcnt, &p1);
+
+ if (mcnt == 0)
+ {
+ p1 -= 4;
+ extract_number_and_incr (&mcnt, &p1);
+ p1 += mcnt;
+ }
+ else
+ return false;
+ break;
+
+ case duplicate:
+ if (!CAN_MATCH_NOTHING (reg_info[*p1]))
+ return false;
+ break;
+
+ case set_number_at:
+ p1 += 4;
+ case before_dot:
+ case at_dot:
+ case after_dot:
+ case begbuf:
+ case endbuf:
+ case wordbeg:
+ case wordend:
+ case wordbound:
+ case notwordbound:
+ break;
+
+ default:
+ /* All other opcodes mean we cannot match the empty string. */
+ return false;
+ }
+
+ *p = p1;
+ return true;
}
+
+/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
+ bytes; nonzero otherwise. */
+
static int
-memcmp_translate (s1, s2, len, translate)
+bcmp_translate (s1, s2, len, translate)
unsigned char *s1, *s2;
register int len;
unsigned char *translate;
@@ -2643,17 +4986,21 @@ memcmp_translate (s1, s2, len, translate)
}
+
-/* Entry points compatible with 4.2 BSD regex library. */
+/* Entry points compatible with 4.2 BSD regex library. We don't define
+ them if this is a GAWK, Emacs, or POSIX compilation. */
-#ifndef emacs
+#if !defined(GAWK) && !defined (emacs) && !defined (_POSIX_SOURCE)
static struct re_pattern_buffer re_comp_buf;
char *
re_comp (s)
- char *s;
+ const char *s;
{
+ char *return_value;
+
if (!s)
{
if (!re_comp_buf.buffer)
@@ -2663,32 +5010,403 @@ re_comp (s)
if (!re_comp_buf.buffer)
{
- if (!(re_comp_buf.buffer = (char *) malloc (200)))
- return "Memory exhausted";
+ re_comp_buf.buffer = (char *) malloc (200);
+
+ if (re_comp_buf.buffer == NULL)
+ return "Memory exhausted";
+
re_comp_buf.allocated = 200;
- if (!(re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH)))
+
+ re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
+
+ if (re_comp_buf.fastmap == NULL)
return "Memory exhausted";
}
- return re_compile_pattern (s, strlen (s), &re_comp_buf);
+ return regex_compile (s, strlen (s), obscure_syntax, &re_comp_buf);
}
int
re_exec (s)
- char *s;
+ const char *s;
{
- int len = strlen (s);
- return 0 <= re_search (&re_comp_buf, s, len, 0, len,
- (struct re_registers *) 0);
+ const int len = strlen (s);
+ return 0 <= re_search (&re_comp_buf, s, len, 0, len,
+ (struct re_registers *) 0);
}
-#endif /* not emacs */
+
+#endif /* not GAWK, not emacs, and not _POSIX_SOURCE */
+
+
+
+/* Entry points compatible with POSIX regex library. Only define these
+ when this is neither an Emacs nor a GAWK compilation. */
+
+#if !defined(emacs) && !defined(GAWK)
+
+/* regcomp takes a regular-expression string and converts it into a
+ buffer full of byte commands for matching.
+
+ PREG is a regex_t * whose pertinent fields are mentioned below:
+
+ It has a char * field called BUFFER which points to the
+ space where this routine will put the compiled pattern; the
+ user can either allocate this using malloc (whereupon they
+ should set the long field ALLOCATED to the number of bytes
+ malloced) or set ALLOCATED to 0 and let the routine
+ allocate it. The routine may use realloc to enlarge the
+ buffer space.
+
+ If the user wants to translate all ordinary elements in the
+ compiled pattern, they should set the char * field
+ TRANSLATE to a translate table (and not set the REG_ICASE
+ bit of CFLAGS, which would override this translate table
+ with one that ignores case); otherwise, they should set
+ TRANSLATE to 0.
+
+ The routine sets the int field SYNTAX to RE_SYNTAX_POSIX_EXTENDED
+ if the REG_EXTENDED bit in CFLAGS is set; otherwise, it sets it
+ to RE_SYNTAX_POSIX_BASIC.
+
+ It returns in the long field USED how many bytes long the
+ compiled pattern is.
+
+ It returns 0 in the char field FASTMAP_ACCURATE, on
+ the assumption that the user usually doesn't compile the
+ same pattern twice and that consequently any fastmap in the
+ pattern buffer is inaccurate.
+
+ In the size_t field RE_NSUB, it returns the number of
+ subexpressions it found in PATTERN.
+
+ PATTERN is the address of the pattern string.
+
+ CFLAGS is a series of bits ORed together which affect compilation.
+ If the bit REG_EXTENDED is set, regcomp compiles the
+ pattern as an extended regular expression, otherwise it
+ compiles it as a basic one. If the bit REG_NEWLINE is set,
+ then dot and nonmatching lists won't match a newline, but
+ pattern anchors will match at them. If the bit REG_ICASE
+ is set, then it considers upper- and lowercase versions of
+ letters to be equal when matching. If the bit REG_NOSUB is
+ set, then when PREG is passed to regexec, that routine will
+ only report success or failure.
+
+
+ It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
+ POSIX return codes and their meanings.) */
+
+
+int
+regcomp (preg, pattern, cflags)
+ regex_t *preg;
+ const char *pattern;
+ int cflags;
+{
+ char *return_value;
+
+ int syntax = cflags & REG_EXTENDED
+ ? RE_SYNTAX_POSIX_EXTENDED
+ : RE_SYNTAX_POSIX_BASIC;
+
+ if (cflags & REG_NEWLINE)
+ {
+ syntax &= ~RE_DOT_NEWLINE;
+ syntax |= RE_HAT_LISTS_NOT_NEWLINE;
+ syntax &= ~RE_NO_ANCHOR_AT_NEWLINE;
+ }
+
+ if (cflags & REG_ICASE)
+ {
+ unsigned i;
+
+ preg->translate = (char *) malloc (CHAR_SET_SIZE);
+
+ if (preg->translate == NULL)
+ return REG_ESPACE;
+
+ /* Map any uppercase characters into corresponding lowercase ones. */
+ for (i = 0; i < CHAR_SET_SIZE; i++)
+ preg->translate[i] = isupper (i) ? tolower (i) : i;
+ }
+ else
+ preg->translate = 0;
+
+ preg->no_sub = cflags & REG_NOSUB;
+
+ return_value = regex_compile (pattern, strlen (pattern), syntax, preg);
+
+
+ if (return_value == 0)
+ return 0;
+ else if (strcmp (return_value, "Invalid regular expression") == 0)
+ return REG_BADPAT;
+ else if (strcmp (return_value, "Invalid character class name") == 0)
+ return REG_ECTYPE;
+ else if (strcmp (return_value, "Trailing backslash") == 0)
+ return REG_EESCAPE;
+ else if (strcmp (return_value, "Invalid back reference") == 0)
+ return REG_ESUBREG;
+ else if (strcmp (return_value, "Unmatched [ or [^") == 0)
+ return REG_EBRACK;
+ else if (strcmp (return_value, "Unmatched ( or \\(") == 0
+ || strcmp (return_value, "Unmatched ) or \\)") == 0)
+ return REG_EPAREN;
+ else if (strcmp (return_value, "Unmatched \\{") == 0)
+ return REG_EBRACE;
+ else if (strcmp (return_value, "Invalid content of \\{\\}") == 0)
+ return REG_BADBR;
+ else if (strcmp (return_value, "Invalid range end") == 0)
+ return REG_ERANGE;
+ else if (strcmp (return_value, "Memory exhausted") == 0)
+ return REG_ESPACE;
+ else if (strcmp (return_value, "Invalid preceding regular expression") == 0
+ || strcmp (return_value,
+ "Missing preceding regular expression") == 0)
+ return REG_BADRPT;
+
+ /* Codes added by GNU. */
+
+ else if (strcmp (return_value, "Premature end of regular expression") == 0)
+ return REG_EEND;
+ else if (strcmp (return_value, "Regular expression too big") == 0)
+ return REG_ESIZE;
+else
+ return REG_BADPAT;
+}
+
+
+/* regexec takes a buffer full of byte commands for matching (gotten
+ from compiling a regular expression) and matches it against a string.
+
+ PREG is a regex_t * whose pertinent fields are mentioned below:
+
+ It has a char * field called BUFFER which points to
+ the byte commands which make up the compiled pattern.
+
+ Its char * field TRANSLATE, if not 0, translates all
+ ordinary elements in the compiled pattern.
+
+ Its int field SYNTAX is the syntax with which the pattern
+ was compiled and hence should be matched with.
+
+ The long field USED is how many bytes long the compiled
+ pattern is.
+
+ Its size_t field RE_NSUB contains how many subexpressions
+ the pattern has. (This may be useful for choosing a value
+ for NMATCH).
+
+ If its unsigned NO_SUB bit is set, then regexec will not
+ return anything in PMATCH, but only report whether or not
+ BUFFER matched STRING.
+
+ Regardless of how its unsigned RETURN_DEFAULT_NUM_REGS bit
+ is set, regexec only returns in PMATCH information about
+ the whole pattern and NMATCH - 1 of its subexpressions.
+
+ STRING is the address of the string to be matched.
+
+ NMATCH is how many elements of PMATCH regex should fill.
+
+ PMATCH is an array of regmatch_t's. If PREG's NO_SUB field
+ isn't set, then regexec records in PMATCH[i], for i = 1 to
+ NMATCH - 1, which substring of STRING matched the i-th
+ subexpression of the regular expression compiled in BUFFER;
+ it records in PMATCH[0] that information about all of
+ STRING. See the comment for `typedef struct...regmatch_t'
+ in regex.h for more details.
+
+ The caller must allocate PMATCH to have at least NMATCH
+ elements.
+
+ EFLAGS is two bits OR-ed together which affect execution. If the
+ bit REG_NOTBOL is set, then STRING's first character is not
+ the beginning of a line; that means any beginning-of-line
+ byte command in BUFFER won't match that first character.
+ If the bit REG_NOTEOL is set, then a similar thing holds
+ for STRING's last character: it isn't the end of a line and
+ any end-of-line byte command in BUFFER won't match it.
+
+
+ It returns 0 if it matches and REG_NOMATCH if it doesn't. */
+
+int
+regexec (preg, string, nmatch, pmatch, eflags)
+ const regex_t *preg;
+ const char *string;
+ size_t nmatch;
+ regmatch_t pmatch[];
+ int eflags;
+{
+ int return_value;
+ unsigned this_op;
+ struct re_registers regs;
+ regex_t private_preg;
+
+ private_preg = *preg;
+ private_preg.not_bol = eflags & REG_NOTBOL;
+ private_preg.not_eol = eflags & REG_NOTEOL;
+
+ private_preg.return_default_num_regs = 0;
+
+ if (!private_preg.no_sub && nmatch > 0)
+ {
+ regs.num_regs = nmatch;
+ regs.start = malloc (nmatch * sizeof (int));
+ regs.end = malloc (nmatch * sizeof (int));
+ }
+ else
+ {
+ regs.num_regs = 0;
+ regs.start = NULL;
+ regs.end = NULL;
+ }
+
+ return_value = re_match (&private_preg, string, strlen (string), 0,
+ !private_preg.no_sub && nmatch > 0 ? &regs : 0);
+
+ if (return_value == strlen (string))
+ {
+ if (!private_preg.no_sub && nmatch > 0)
+ {
+ unsigned this_reg;
+
+ for (this_reg = 0; this_reg < nmatch; this_reg++)
+ {
+ pmatch[this_reg].rm_so = regs.start[this_reg];
+ pmatch[this_reg].rm_eo = regs.end[this_reg];
+ }
+ }
+ }
+ if (regs.start != NULL)
+ free (regs.start);
+
+ if (regs.end != NULL)
+ free (regs.end);
+
+ return return_value == strlen (string) ? 0 : REG_NOMATCH;
+}
+
+
+/* Puts the first BUFFER_SIZE - 1 characters of MESSAGE in BUFFER (if BUFFER isn't null)
+ and terminates it with a null.
+
+ Returns one more than the size of MESSAGE. */
+
+static size_t
+put_in_buffer (message, buffer, buffer_size)
+ char *message;
+ char *buffer;
+ size_t buffer_size;
+{
+ unsigned this_char;
+
+ if (buffer != NULL && buffer_size > 0)
+ {
+ strncpy (buffer, message, buffer_size - 1);
+ buffer[buffer_size - 1] = 0;
+ }
+
+ return strlen (message) + 1;
+}
+
+
+/* Puts in ERRBUF the message for ERRCODE (as returned from regcomp or regexec)
+ and returns one more than the length of that message. */
+
+size_t
+re_gerror (errcode, preg, errbuf, errbuf_size)
+ int errcode;
+ const regex_t *preg;
+ char *errbuf;
+ size_t errbuf_size;
+{
+ switch (errcode)
+ {
+ case REG_NOERROR:
+ return put_in_buffer ("Regex message: no error.", errbuf, errbuf_size);
+
+ case REG_NOMATCH:
+ return put_in_buffer ("Regex error: regexec didn't find a match.",
+ errbuf, errbuf_size);
+ case REG_BADPAT:
+ return put_in_buffer ("Regex error: Invalid regular expression.",
+ errbuf, errbuf_size);
+ case REG_ECOLLATE:
+ return put_in_buffer ("Regex error: (not implemented) Invalid \
+collating character.", errbuf, errbuf_size);
+
+ case REG_ECTYPE:
+ return put_in_buffer ("Regex error: Invalid character class name.",
+ errbuf, errbuf_size);
+ case REG_EESCAPE:
+ return put_in_buffer ("Regex error: Trailing backslash.",
+ errbuf, errbuf_size);
+ case REG_ESUBREG:
+ return put_in_buffer("Regex error: Invalid back reference.",
+ errbuf, errbuf_size);
+ case REG_EBRACK:
+ return put_in_buffer ("Regex error: Unmatched [ or [^.",
+ errbuf, errbuf_size);
+ case REG_EPAREN:
+ return put_in_buffer ("Regex error: Unmatched parenthesis.",
+ errbuf, errbuf_size);
+ case REG_EBRACE:
+ return put_in_buffer ("Regex error: Unmatched \\{.",
+ errbuf, errbuf_size);
+ case REG_BADBR:
+ return put_in_buffer ("Regex error: Invalid content of \\{\\}.",
+ errbuf, errbuf_size);
+ case REG_ERANGE:
+ return put_in_buffer ("Regex error: Invalid range end.",
+ errbuf, errbuf_size);
+ case REG_ESPACE:
+ return put_in_buffer ("Regex error: Ran out of memory.",
+ errbuf, errbuf_size);
+ case REG_BADRPT:
+ return put_in_buffer ("Regex error: Preceding regular expression \
+either missing or not simple.", errbuf, errbuf_size);
+
+ case REG_EEND:
+ return put_in_buffer ("Regex error: Regular expression ended \
+prematurely.", errbuf, errbuf_size);
+
+ case REG_ESIZE:
+ return put_in_buffer ("Regex error: Excessively large regular \
+expression.", errbuf, errbuf_size);
+ }
+}
+
+
+void
+re_gfree (preg)
+ regex_t *preg;
+{
+ if (preg->buffer != NULL)
+ free (preg->buffer);
+ preg->buffer = NULL;
+
+ preg->allocated = 0;
+ preg->used = 0;
+
+ if (preg->fastmap != NULL)
+ free (preg->fastmap);
+ preg->fastmap = NULL;
+
+ preg->fastmap_accurate = 0;
+
+ if (preg->translate != NULL)
+ free (preg->translate);
+ preg->translate = NULL;
+}
+
+#endif /* not emacs and not GAWK */
+
#ifdef test
-#ifdef atarist
-long _stksize = 2L; /* reserve memory for stack */
-#endif
#include <stdio.h>
/* Indexed by a character, gives the upper case equivalent of the
@@ -2733,25 +5451,25 @@ char upcase[0400] =
#include "tests.h"
-typedef enum { extended_test, basic_test } test_type;
+typedef enum {extended_test, basic_test, other_test, interface_test} test_type;
/* Use this to run the tests we've thought of. */
void
main ()
{
- test_type t = extended_test;
-
+ test_type t = interface_test;
+
if (t == basic_test)
- {
- printf ("Running basic tests:\n\n");
- test_posix_basic ();
- }
+ test_posix_basic ();
else if (t == extended_test)
- {
- printf ("Running extended tests:\n\n");
- test_posix_extended ();
- }
+ test_posix_extended ();
+ else if (t == other_test)
+ test_others ();
+ else if (t == interface_test)
+ test_posix_c_interface ();
+
+ exit (0);
}
#else /* not canned */
@@ -2771,7 +5489,7 @@ main (argc, argv)
/* Allow a command argument to specify the style of syntax. */
if (argc > 1)
- obscure_syntax = atoi (argv[1]);
+ re_set_syntax (atoi (argv[1]));
buf.allocated = 40;
buf.buffer = (char *) malloc (buf.allocated);
@@ -2810,7 +5528,7 @@ main (argc, argv)
#endif
-#ifdef NOTDEF
+#if 0
print_buf (bufp)
struct re_pattern_buffer *bufp;
{
@@ -2834,7 +5552,8 @@ print_buf (bufp)
printf ("\nfastmap is%s accurate\n", bufp->fastmap_accurate ? "" : "n't");
printf ("can %s be null\n----------", bufp->can_be_null ? "" : "not");
}
-#endif /* NOTDEF */
+#endif /* 0 */
+
printchar (c)
char c;
@@ -2857,3 +5576,13 @@ error (string)
exit (1);
}
#endif /* test */
+
+
+
+/*
+Local variables:
+make-backup-files: t
+version-control: t
+trim-versions-without-asking: nil
+End:
+*/
diff --git a/regex.h b/regex.h
index 145b6d13..6f735156 100644
--- a/regex.h
+++ b/regex.h
@@ -1,10 +1,11 @@
/* Definitions for data structures callers pass the regex library.
-
- Copyright (C) 1985, 1989-90 Free Software Foundation, Inc.
+ Requires sys/types.h for size_t.
+ Version 0.1.
+ Copyright (C) 1985, 89, 90, 91 Free Software Foundation, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 1, or (at your option)
+ the Free Software Foundation; either version 2, or (at your option)
any later version.
This program is distributed in the hope that it will be useful,
@@ -17,228 +18,546 @@
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
-#ifdef __GNUC__
- #pragma once
-#endif
-
#ifndef __REGEXP_LIBRARY
#define __REGEXP_LIBRARY
-/* Define number of parens for which we record the beginnings and ends.
- This affects how much space the `struct re_registers' type takes up. */
-#ifndef RE_NREGS
-#define RE_NREGS 10
-#endif
-
-#define BYTEWIDTH 8
-
-
-/* Maximum number of duplicates an interval can allow. */
-#define RE_DUP_MAX ((1 << 15) - 1)
-/* This defines the various regexp syntaxes. */
+/* This defines the particular regexp syntax to use. */
extern int obscure_syntax;
/* The following bits are used in the obscure_syntax variable to choose among
alternative regexp syntaxes. */
-/* If this bit is set, plain parentheses serve as grouping, and backslash
- parentheses are needed for literal searching.
- If not set, backslash-parentheses are grouping, and plain parentheses
- are for literal searching. */
+/* If this bit is set, (...) defines a group, and \( and \) are literals.
+ If not set, \(...\) defines a group, and ( and ) are literals. */
#define RE_NO_BK_PARENS 1
-/* If this bit is set, plain | serves as the `or'-operator, and \| is a
- literal.
- If not set, \| serves as the `or'-operator, and | is a literal. */
+/* If this bit is set, then | is an alternation operator, and \| is literal.
+ If not set, then \| is an alternation operator, and | is literal. */
#define RE_NO_BK_VBAR (1 << 1)
-/* If this bit is not set, plain + or ? serves as an operator, and \+, \? are
- literals.
- If set, \+, \? are operators and plain +, ? are literals. */
+/* If this bit is not set, then + and ? are operators, and \+ and \? are
+ literals.
+ If set, then \+ and \? are operators and + and ? are literals. */
#define RE_BK_PLUS_QM (1 << 2)
-
-/* If this bit is set, | binds tighter than ^ or $.
+
+/* If this bit is set, then | binds tighter than ^ or $.
If not set, the contrary. */
-#define RE_TIGHT_VBAR (1 << 3)
-
-/* If this bit is set, then treat newline as an OR operator.
- If not set, treat it as a normal character. */
-#define RE_NEWLINE_OR (1 << 4)
-
-/* If this bit is set, then special characters may act as normal
- characters in some contexts. Specifically, this applies to:
- ^ -- only special at the beginning, or after ( or |;
- $ -- only special at the end, or before ) or |;
- *, +, ? -- only special when not after the beginning, (, or |.
- If this bit is not set, special characters (such as *, ^, and $)
- always have their special meaning regardless of the surrounding
- context. */
+#define RE_TIGHT_ALT (1 << 3)
+
+/* If this bit is set, newline is an alternation operator.
+ If not set, then newline is literal. */
+#define RE_NEWLINE_ALT (1 << 4)
+
+/* If this bit is set, then special characters are always special
+ regardless of where they are in the pattern.
+ If this bit is not set, then special characters are special only in
+ some contexts; otherwise they are ordinary. Specifically,
+
+ * + ? and intervals are only special when not after the beginning,
+ open-group, or alternation operator. */
#define RE_CONTEXT_INDEP_OPS (1 << 5)
-/* If this bit is not set, then \ before anything inside [ and ] is taken as
- a real \.
- If set, then such a \ escapes the following character. This is a
- special case for awk. */
+/* If this bit is not set, then \ inside a bracket expression is literal.
+ If set, then such a \ quotes the following character. */
#define RE_AWK_CLASS_HACK (1 << 6)
-/* If this bit is set, then \{ and \} or { and } serve as interval operators.
- If not set, then \{ and \} and { and } are treated as literals. */
+/* If this bit is set, then either \{...\} or {...} defines an
+ interval, depending on RE_NO_BK_BRACES.
+ If not set, then \{, \}, {, and } are literals. */
#define RE_INTERVALS (1 << 7)
-/* If this bit is not set, then \{ and \} serve as interval operators and
- { and } are literals.
- If set, then { and } serve as interval operators and \{ and \} are
- literals. */
-#define RE_NO_BK_CURLY_BRACES (1 << 8)
+/* If this bit is not set, then \{ and \} define an interval,
+ and { and } are literals.
+ If set, then { and } define an interval, and \{ and \} are literals. */
+#define RE_NO_BK_BRACES (1 << 8)
-/* If this bit is set, then character classes are supported; they are:
+/* If this bit is set, then character classes are supported. They are:
[:alpha:], [:upper:], [:lower:], [:digit:], [:alnum:], [:xdigit:],
[:space:], [:print:], [:punct:], [:graph:], and [:cntrl:].
If not set, then character classes are not supported. */
#define RE_CHAR_CLASSES (1 << 9)
-/* If this bit is set, then the dot re doesn't match a null byte.
- If not set, it does. */
+/* If this bit is set, then period doesn't match a null.
+ If not set, then it does. */
#define RE_DOT_NOT_NULL (1 << 10)
/* If this bit is set, then [^...] doesn't match a newline.
- If not set, it does. */
-#define RE_HAT_NOT_NEWLINE (1 << 11)
+ If not set, then it does. */
+#define RE_HAT_LISTS_NOT_NEWLINE (1 << 11)
-/* If this bit is set, back references are recognized.
- If not set, they aren't. */
+/* If this bit is set, then back references are not recognized.
+ If not set, then they are. */
#define RE_NO_BK_REFS (1 << 12)
-/* If this bit is set, back references must refer to a preceding
- subexpression. If not set, a back reference to a nonexistent
- subexpression is treated as literal characters. */
-#define RE_NO_EMPTY_BK_REF (1 << 13)
-
-/* If this bit is set, bracket expressions can't be empty.
- If it is set, they can be empty. */
-#define RE_NO_EMPTY_BRACKETS (1 << 14)
+/* If this bit is set, then all back references must refer to a preceding
+ subexpression.
+ If not set, then a back reference to a nonexistent subexpression is
+ treated as literal characters. */
+#define RE_NO_MISSING_BK_REF (1 << 13)
-/* If this bit is set, then *, +, ? and { cannot be first in an re or
- immediately after a |, or a (. Furthermore, a | cannot be first or
- last in an re, or immediately follow another | or a (. Also, a ^
- cannot appear in a nonleading position and a $ cannot appear in a
- nontrailing position (outside of bracket expressions, that is). */
-#define RE_CONTEXTUAL_INVALID_OPS (1 << 15)
+/* If this bit is set, then *, +, ?, and { cannot be first in an re or
+ immediately after OR or BEGINGROUP. Furthermore, OR cannot be
+ first or last in an re, or immediately follow another OR or
+ BEGINGROUP. Also, ^ cannot appear in a nonleading position and $
+ cannot appear in a nontrailing position (outside of bracket
+ expressions, that is). */
+#define RE_CONTEXT_INVALID_OPS (1 << 14)
/* If this bit is set, then +, ? and | aren't recognized as operators.
- If it's not, they are. */
-#define RE_LIMITED_OPS (1 << 16)
+ If not set, then they are. */
+#define RE_LIMITED_OPS (1 << 15)
/* If this bit is set, then an ending range point has to collate higher
- or equal to the starting range point.
- If it's not set, then when the ending range point collates higher
- than the starting range point, the range is just considered empty. */
-#define RE_NO_EMPTY_RANGES (1 << 17)
-
-/* If this bit is set, then a hyphen (-) can't be an ending range point.
- If it isn't, then it can. */
-#define RE_NO_HYPHEN_RANGE_END (1 << 18)
-
-
-/* Define combinations of bits for the standard possibilities. */
-#define RE_SYNTAX_POSIX_AWK (RE_NO_BK_PARENS | RE_NO_BK_VBAR \
- | RE_CONTEXT_INDEP_OPS)
-#define RE_SYNTAX_AWK (RE_NO_BK_PARENS | RE_NO_BK_VBAR | RE_AWK_CLASS_HACK)
-#define RE_SYNTAX_EGREP (RE_NO_BK_PARENS | RE_NO_BK_VBAR \
- | RE_CONTEXT_INDEP_OPS | RE_NEWLINE_OR)
-#define RE_SYNTAX_GREP (RE_BK_PLUS_QM | RE_NEWLINE_OR)
+ than or equal to the starting range point.
+ If not set, then when the ending range point collates lower than the
+ starting range point, the range is considered to be empty. */
+#define RE_NO_EMPTY_RANGES (1 << 16)
+
+/* If this bit is set, then neither the match-beginning-of-line nor
+ the match-end-of-line operator match a newline.
+ If not set, then these operators can match a newline. */
+#define RE_NO_ANCHOR_AT_NEWLINE (1 << 17)
+
+/* If this bit is set, then you can't have empty groups.
+ If not set, then you can. */
+#define RE_NO_EMPTY_GROUPS (1 << 18)
+
+/* If this bit is set, then you can't have empty alternatives.
+ If not set, then you can. */
+#define RE_NO_EMPTY_ALTS (1 << 19)
+
+/* If this bit is set, then you can't have more than one non-interval
+ repetition operator (i.e., `*', `+' and `?') in a row, e.g., as in
+ `a*+?*'.
+ If not set, then you can. */
+#define RE_NO_CONSECUTIVE_REPEATS (1 << 20)
+
+
+/* If this bit is set, then ignore anchors inside groups which in turn
+ are operated on by repetition operators.
+ If not set, then don't. */
+#define RE_REPEATED_ANCHORS_AWAY (1 << 21)
+
+/* If this bit is set, then the match-any-character operator (.) matches
+ a newline.
+ If not set, then it doesn't. */
+#define RE_DOT_NEWLINE (1 << 22)
+
+/* If this bit is set, then '^' and '$' can be anchors only at the
+ beginning or the end of the pattern.
+ If not set, then they don't have to be at the beginning or end of the
+ pattern to be anchors. */
+#define RE_ANCHORS_ONLY_AT_ENDS (1 << 23)
+
+/* If this bit is set, then Regex considers an unmatched close-group
+ operator to be the ordinary character parenthesis.
+ If not set, then an unmatched close-group operator is invalid. */
+#define RE_UNMATCHED_RIGHT_PAREN_ORD (1 << 24)
+
+/* If this bit is set, then ^ cannot appear in a nonleading position and
+ $ cannot appear in a nontrailing position (outside of bracket
+ expressions, that is). */
+#define RE_CONTEXT_INVALID_ANCHORS (1 << 25)
+
+/* If this bit is set, then ^ and $ are always anchors, regardless of
+ their positions in a regular expression.
+ If this bit is not set, then ^ is an anchor only if in a leading
+ position and $ is one only if in a trailing position. Specifically,
+
+ ^ is in a leading position if at the beginning of a regular
+ expression, or after an open-group or an alternation operator;
+
+ $ is in a trailing position if at the end of a regular
+ expression, or before close-group or an alternation operator.
+*/
+#define RE_CONTEXT_INDEP_ANCHORS (1 << 26)
+
+/* If this bit is set, then the searching and matching routines will
+ allocate enough register space to accommodate the number of groups
+ in the regular expression.
+ If this bit is not set, then the user must allocate the space. */
+#define RE_ALLOCATE_REGISTERS (1 << 27)
+
+
+/* Define combinations of the above bits for the standard possibilities. */
#define RE_SYNTAX_EMACS 0
-#define RE_SYNTAX_POSIX_BASIC (RE_INTERVALS | RE_BK_PLUS_QM \
- | RE_CHAR_CLASSES | RE_DOT_NOT_NULL \
- | RE_HAT_NOT_NEWLINE | RE_NO_EMPTY_BK_REF \
- | RE_NO_EMPTY_BRACKETS | RE_LIMITED_OPS \
- | RE_NO_EMPTY_RANGES | RE_NO_HYPHEN_RANGE_END)
-
-#define RE_SYNTAX_POSIX_EXTENDED (RE_INTERVALS | RE_NO_BK_CURLY_BRACES \
- | RE_NO_BK_VBAR | RE_NO_BK_PARENS \
- | RE_HAT_NOT_NEWLINE | RE_CHAR_CLASSES \
- | RE_NO_EMPTY_BRACKETS | RE_CONTEXTUAL_INVALID_OPS \
- | RE_NO_BK_REFS | RE_NO_EMPTY_RANGES \
- | RE_NO_HYPHEN_RANGE_END)
-
-
-/* This data structure is used to represent a compiled pattern. */
+
+#define RE_SYNTAX_AWK \
+ (RE_NO_BK_PARENS | RE_NO_BK_VBAR | RE_ALLOCATE_REGISTERS \
+ | RE_AWK_CLASS_HACK)
+
+#define RE_SYNTAX_POSIX_AWK RE_SYNTAX_AWK
+
+#define RE_SYNTAX_GREP \
+ (RE_BK_PLUS_QM | RE_NEWLINE_ALT | RE_ALLOCATE_REGISTERS)
+
+#define RE_SYNTAX_EGREP \
+ (RE_NO_BK_PARENS | RE_NO_BK_VBAR | RE_CONTEXT_INDEP_OPS \
+ | RE_CONTEXT_INDEP_ANCHORS | RE_NEWLINE_ALT | RE_ALLOCATE_REGISTERS)
+
+#define RE_SYNTAX_POSIX_BASIC \
+ (RE_INTERVALS | RE_CHAR_CLASSES | RE_DOT_NOT_NULL \
+ | RE_NO_MISSING_BK_REF | RE_LIMITED_OPS | RE_NO_EMPTY_RANGES \
+ | RE_NO_ANCHOR_AT_NEWLINE | RE_DOT_NEWLINE | RE_ALLOCATE_REGISTERS)
+
+#define RE_SYNTAX_POSIX_EXTENDED \
+ (RE_INTERVALS | RE_NO_BK_BRACES | RE_NO_BK_VBAR \
+ | RE_NO_BK_PARENS | RE_CHAR_CLASSES | RE_CONTEXT_INVALID_OPS \
+ | RE_NO_BK_REFS | RE_NO_EMPTY_RANGES | RE_UNMATCHED_RIGHT_PAREN_ORD \
+ | RE_DOT_NOT_NULL | RE_NO_EMPTY_GROUPS | RE_NO_EMPTY_ALTS \
+ | RE_NO_ANCHOR_AT_NEWLINE | RE_DOT_NEWLINE | RE_CONTEXT_INDEP_ANCHORS\
+ | RE_ALLOCATE_REGISTERS)
+
+
+
+/* Maximum number of duplicates an interval can allow. */
+#define RE_DUP_MAX ((1 << 15) - 1)
+
+
+/* POSIX cflags bits (i.e., information for regcomp). */
+
+/* If this bit is set, then use extended regular expression syntax.
+ If not set, then use basic regular expression syntax. */
+#define REG_EXTENDED 1
+
+/* If this bit is set, then (line 526, p.687 of POSIX 1003.2/D10)
+ newline loses its special significance; i.e., anchors do not match at
+ newlines in the string.
+ If not set, then anchors do match at newlines. */
+#define REG_NEWLINE (1 << 1)
+
+/* If this bit is set, then ignore case when matching.
+ If not set, then case is significant. */
+#define REG_ICASE (1 << 2)
+
+/* If this bit is set, then report only success or fail in regexec ().
+ If not set, then return nonzero indicating either not match or an error. */
+#define REG_NOSUB (1 << 3)
+
+
+/* POSIX eflags bits (i.e., information for regexec). */
+
+/* If this bit is set, then the string's first character is not the
+ beginning of a line, so the beginning-of-line anchor shouldn't
+ match it.
+ If not set, then the string's first character can match the
+ beginning-of-line anchor. */
+#define REG_NOTBOL 1
+
+/* If this bit is set, then the string's last character is not the
+ end of a line, so the end-of-line anchor shouldn't match it.
+ If not set, then the string's last character can match the
+ end-of-line anchor. */
+#define REG_NOTEOL (1 << 1)
+
+
+/* POSIX regexec return error value. */
+
+#define REG_NOMATCH 1 /* Didn't find a match. */
+
+/* POSIX regcomp return error codes. */
+
+#define REG_BADPAT 2 /* Found an invalid pattern. */
+#define REG_ECOLLATE 3 /* Not implemented. */
+#define REG_ECTYPE 4 /* Found an invalid character class name. */
+#define REG_EESCAPE 5 /* Found a trailing backslash. */
+#define REG_ESUBREG 6 /* Found an invalid back reference. */
+#define REG_EBRACK 7 /* Found an unmatched left bracket. */
+#define REG_EPAREN 8 /* Found a parentheses imbalance. */
+#define REG_EBRACE 9 /* Found an unmatched \{. */
+#define REG_BADBR 10 /* Found invalid contents of \{\}. */
+#define REG_ERANGE 11 /* Found invalid range end. */
+#define REG_ESPACE 12 /* Ran out of memory. */
+#define REG_BADRPT 13 /* No preceding re for repetition op. */
+#define REG_ENEWLINE 14 /* Not implemented. */
+
+/* Some regcomp codes we've added. */
+#define REG_NOERROR 0 /* No error. */
+#define REG_EEND 15
+#define REG_ESIZE 16
+
+
+
+
+/* This data structure represents a compiled pattern. Before calling
+ the pattern compiler, the fields `buffer', `allocated', `fastmap',
+ `translate', and `no_sub' can be set. After the pattern has been
+ compiled, the `re_nsub' field is available. All other fields are
+ private to the regex routines. */
+
+/* If this changes, change documentation in regex.texinfo. */
struct re_pattern_buffer
- {
- char *buffer; /* Space holding the compiled pattern commands. */
- long allocated; /* Size of space that `buffer' points to. */
- long used; /* Length of portion of buffer actually occupied */
- char *fastmap; /* Pointer to fastmap, if any, or zero if none. */
- /* re_search uses the fastmap, if there is one,
- to skip over totally implausible characters. */
- char *translate; /* Translate table to apply to all characters before
- comparing, or zero for no translation.
- The translation is applied to a pattern when it is
- compiled and to data when it is matched. */
- char fastmap_accurate;
- /* Set to zero when a new pattern is stored,
- set to one when the fastmap is updated from it. */
- char can_be_null; /* Set to one by compiling fastmap
- if this pattern might match the null string.
- It does not necessarily match the null string
- in that case, but if this is zero, it cannot.
- 2 as value means can match null string
- but at end of range or before a character
- listed in the fastmap. */
- };
-
-
-/* search.c (search_buffer) needs this one value. It is defined both in
- regex.c and here. */
+{
+ /* Space that holds the compiled pattern. */
+ char *buffer;
+
+ /* Number of bytes to which `buffer' points. */
+ long allocated;
+
+ /* Number of bytes actually used in `buffer'. */
+ long used;
+
+ /* Syntax setting with which the pattern was compiled. */
+ int syntax;
+
+ /* Pointer to a fastmap, if any, otherwise zero. re_search uses
+ the fastmap, if there is one, to skip over impossible
+ starting points for matches. */
+ char *fastmap;
+
+ /* Either a translate table to apply to all characters before
+ comparing them, or zero for no translation. The translation
+ is applied to a pattern when it is compiled and to a string
+ when it is matched. */
+ char *translate;
+
+ /* Number of subexpressions found by the compiler. */
+ size_t re_nsub;
+
+ /* Set to 1 by re_compile_fastmap if this pattern can match the
+ null string; 0 prevents the searcher from matching it with
+ the null string. Set to 2 if it might match the null string
+ either at the end of a search range or just before a
+ character listed in the fastmap. */
+ char can_be_null;
+
+
+ /* The remaining fields are all one-bit booleans. */
+
+ /* Set to zero when regex_compile compiles a pattern; set to one
+ by re_compile_fastmap when it updates the fastmap, if any. */
+ unsigned fastmap_accurate : 1;
+
+ /* If set, regexec reports only success or failure and does not
+ return anything in pmatch[]. */
+ unsigned no_sub : 1;
+
+ /* If set, a beginning-of-line anchor never matches. */
+ unsigned not_bol : 1;
+
+ /* Similarly for an end-of-line anchor. */
+ unsigned not_eol : 1;
+
+ /* If set, and the regs argument is nonzero, the GNU
+ matching and searching functions return information
+ for as many registers as needed to report about the
+ whole pattern and all its subexpressions. If not set,
+ and the regs argument is nonzero, then the functions
+ return information for regs->num_regs registers. */
+ unsigned return_default_num_regs : 1;
+};
+
+typedef struct re_pattern_buffer regex_t;
+
+
+/* search.c (search_buffer) in Emacs needs this one value. It is
+ defined both in `regex.c' and here. */
#define RE_EXACTN_VALUE 1
-/* Structure to store register contents data in.
+
+/* struct re_registers: Structure to store register contents data in.
+
+ (If change comments here, change in regex.texinfo also.)
+
+ Some groups in a regular expression match (possibly empty) substrings
+ of the string that regular expression matched. The matcher remembers
+ the beginning and ending point of the substring matched by each
+ group. To get what they matched, pass the address of a structure of
+ this type to a GNU matching or searching function.
+
+ When you call a GNU matching and searching function, it stores
+ information into this structure according to the following (in all
+ examples below, `(' represents the open-group and `)' the
+ close-group operator):
+
+ If the regular expression has an i-th group that matches a substring
+ of string, then the function sets REGS->start[i] to the index in
+ string where the substring matched by the i-th group begins, and
+ REGS->end[i] to the index just beyond that substring's end. The
+ function sets REGS->start[0] and REGS->end[0] to analogous
+ information about the entire pattern.
+
+ For example, when you match the pattern `((a)(b))' with the string
+ `ab', you get:
+
+ 0 in REGS->start[0] and 2 in REGS->end[0]
+ 0 in REGS->start[1] and 2 in REGS->end[1]
+ 0 in REGS->start[2] and 1 in REGS->end[2]
+ 1 in REGS->start[3] and 2 in REGS->end[3]
+
+ If a group matches more than once (as it might if followed by, e.g.,
+ a repetition operator), then the function reports the information
+ about what the group last matched.
+
+ For example, when you match the string `aa' with the pattern `(a)*',
+ you get:
+
+ 0 in REGS->start[0] and 2 in REGS->end[0]
+ 1 in REGS->start[1] and 2 in REGS->end[1]
+
+
+ If the i-th group does not participate in a successful match, e.g.,
+ it is an alternative not taken or a repetition operator allows zero
+ repetitions of it, then the function sets REGS->start[i] and
+ REGS->end[i] to -1.
+
+ For example, when you match the string `b' with the pattern `(a)*b',
+ you get:
+
+ 0 in REGS->start[0] and 1 in REGS->end[0]
+ -1 in REGS->start[1] and -1 in REGS->end[1]
+
+
+ If the i-th group matches a zero-length string, then the function
+ sets REGS->start[i] and REGS->end[i] to the index just beyond that
+ zero-length string.
+
+ For example, when you match the string `b' with the pattern `(a*)b',
+ you get:
+
+ 0 in REGS->start[0] and 1 in REGS->end[0]
+ 0 in REGS->start[1] and 0 in REGS->end[1]
+
+ The function sets REGS->start[0] and REGS->end[0] to analogous
+ information about the entire pattern.
+
+ For example, when you match the empty string with the pattern `(a*)',
+ you get:
+
+ 0 in REGS->start[0] and 0 in REGS->end[0]
+ 0 in REGS->start[1] and 0 in REGS->end[1]
+
+ If an i-th group contains a j-th group and the function reports a
+ match of the i-th group, then it records in REGS->start[j] and
+ REGS->end[j] the last match (if it matched) of the j-th group.
+
+ For example, when you match the string `abb' with the pattern
+ `((a*)b)*', group 2 last matches the empty string, so you get:
+
+ 0 in REGS->start[0] and 3 in REGS->end[0]
+ 2 in REGS->start[1] and 3 in REGS->end[1]
+ 2 in REGS->start[2] and 2 in REGS->end[2]
+
+ When you match the string `abb' with the pattern `((a)*b)*', group 2
+ doesn't participate in the last match, so you get:
- Pass the address of such a structure as an argument to re_match, etc.,
- if you want this information back.
+ 0 in REGS->start[0] and 3 in REGS->end[0]
+ 2 in REGS->start[1] and 3 in REGS->end[1]
+ 0 in REGS->start[2] and 1 in REGS->end[2]
- For i from 1 to RE_NREGS - 1, start[i] records the starting index in
- the string of where the ith subexpression matched, and end[i] records
- one after the ending index. start[0] and end[0] are analogous, for
- the entire pattern. */
+ If an i-th group contains a j-th group and the function sets
+ REGS->start[i] and REGS->end[i] to -1, then it also sets REGS->start[j]
+ and REGS->end[j] to -1.
+
+ For example, when you match the string `c' with the pattern
+ `((a)*b)*c', you get:
+
+ 0 in REGS->start[0] and 1 in REGS->end[0]
+ -1 in REGS->start[1] and -1 in REGS->end[1]
+ -1 in REGS->start[2] and -1 in REGS->end[2]
+*/
struct re_registers
- {
- int start[RE_NREGS];
- int end[RE_NREGS];
- };
+{
+ unsigned num_regs;
+ int *start;
+ int *end;
+};
+
+
+/* POSIX specification for registers. See comments for struct
+ re_registers for how this is used and read `POSIX' for `GNU',
+ `PMATCH' for `REGS', `PMATCH[i].rm_so' for `REGS->start' and
+ `PMATCH[i].rm_eo' for `REGS->end'. */
+
+typedef off_t regoff_t;
+
+typedef struct
+{
+ regoff_t rm_so; /* Byte offset from string's start to substring's start. */
+ regoff_t rm_eo; /* Byte offset from string's start to substring's end. */
+} regmatch_t;
#ifdef __STDC__
-extern char *re_compile_pattern (char *, size_t, struct re_pattern_buffer *);
-/* Is this really advertised? */
-extern void re_compile_fastmap (struct re_pattern_buffer *);
-extern int re_search (struct re_pattern_buffer *, char*, int, int, int,
- struct re_registers *);
-extern int re_search_2 (struct re_pattern_buffer *, char *, int,
- char *, int, int, int,
- struct re_registers *, int);
-extern int re_match (struct re_pattern_buffer *, char *, int, int,
- struct re_registers *);
-extern int re_match_2 (struct re_pattern_buffer *, char *, int,
- char *, int, int, struct re_registers *, int);
+/* Compile the regular expression PATTERN, with length LENGTH
+ and syntax given by the global `obscure_syntax', into the buffer
+ BUFFER. Return NULL if successful, and an error string if not. */
+
+extern char *re_compile_pattern (const char *pattern, const int length,
+ struct re_pattern_buffer *buffer);
+
+
+/* Compile a fastmap for the compiled pattern in BUFFER; used to
+ accelerate searches. Return 0 if successful and -2 if there was an
+ internal error. */
+
+extern int re_compile_fastmap (struct re_pattern_buffer *buffer);
+
+
+/* Search in the string STRING (with length LENGTH) for the pattern
+ compiled into BUFFER. Start searching at position START, for RANGE
+ characters. Return the starting position of the match or -1 for no
+ match, or -2 for an internal error. Also return register
+ information in REGS (if REGS is non-null). */
+
+extern int re_search (struct re_pattern_buffer *buffer,
+ const char *string, const int length,
+ const int start, const int range,
+ struct re_registers *regs);
+
+/* Like `re_search', but search in the concatenation of STRING1 and
+ STRING2. Also, stop searching at index START + STOP. */
+
+extern int re_search_2 (struct re_pattern_buffer *buffer,
+ const char *string1, const int length1,
+ const char *string2, const int length2,
+ const int start, const int range,
+ struct re_registers *regs,
+ const int stop);
+
+/* Like `re_search', but return how many characters in STRING the regexp
+ in BUFFER matched, starting at position START. */
+
+extern int re_match (const struct re_pattern_buffer *buffer,
+ const char *string, const int length,
+ const int start, struct re_registers *regs);
+
+
+/* Relates to `re_match' as `re_search_2' relates to `re_search'. */
+
+extern int re_match_2 (const struct re_pattern_buffer *buffer,
+ const char *string1, const int length1,
+ const char *string2, const int length2,
+ const int start,
+ struct re_registers *regs,
+ const int stop);
+
/* 4.2 bsd compatibility. */
-extern char *re_comp (char *);
-extern int re_exec (char *);
+extern char *re_comp (const char *);
+extern int re_exec (const char *);
-#else /* !__STDC__ */
+extern int regcomp (regex_t *preg, const char *pattern, int cflags);
+extern int regexec (const regex_t *preg, const char *string, size_t nmatch,
+ regmatch_t pmatch[], int eflags);
+extern size_t re_gerror (int errcode, const regex_t *preg, char* errbuf,
+ size_t errbuf_size);
+extern void re_gfree (regex_t *preg);
+
+#else /* not __STDC__ */
+
+/* Support old C compilers. */
+#define const
extern char *re_compile_pattern ();
-/* Is this really advertised? */
-extern void re_compile_fastmap ();
extern int re_search (), re_search_2 ();
extern int re_match (), re_match_2 ();
@@ -246,6 +565,11 @@ extern int re_match (), re_match_2 ();
extern char *re_comp ();
extern int re_exec ();
+extern int regcomp ();
+extern int regexec ();
+extern size_t re_gerror ();
+extern void re_gfree ();
+
#endif /* __STDC__ */
@@ -253,4 +577,14 @@ extern int re_exec ();
extern char *re_syntax_table;
#endif
-#endif /* !__REGEXP_LIBRARY */
+#endif /* not __REGEXP_LIBRARY */
+
+
+
+/*
+Local variables:
+make-backup-files: t
+version-control: t
+trim-versions-without-asking: nil
+End:
+*/
diff --git a/support/makeinfo.patch b/support/makeinfo.patch
new file mode 100644
index 00000000..7d2d307f
--- /dev/null
+++ b/support/makeinfo.patch
@@ -0,0 +1,233 @@
+*** makeinfo.c.dist Thu Sep 26 21:45:04 1991
+--- makeinfo.c Thu Oct 24 21:04:19 1991
+***************
+*** 73,78 ****
+--- 73,79 ----
+ /* Forward declarations. */
+ unsigned char *xmalloc (), *xrealloc ();
+ extern int in_fixed_width_font;
++ extern int the_current_enumerate_type;
+
+ /* Some systems don't declare this function in pwd.h. */
+ struct passwd *getpwnam ();
+***************
+*** 925,931 ****
+ extern int executing_string;
+ FSTACK *temp = filestack;
+
+! if (!filestack)
+ abort (); /* My fault. I wonder what I did? */
+
+ /* Make sure that commands with braces have been satisfied. */
+--- 926,932 ----
+ extern int executing_string;
+ FSTACK *temp = filestack;
+
+! if (!filestack && !executing_string)
+ abort (); /* My fault. I wonder what I did? */
+
+ /* Make sure that commands with braces have been satisfied. */
+***************
+*** 941,947 ****
+
+ /* Pop the stack. */
+ filestack = filestack->next;
+! free (temp);
+ pop_node_filename ();
+ }
+
+--- 942,949 ----
+
+ /* Pop the stack. */
+ filestack = filestack->next;
+! if (temp)
+! free (temp);
+ pop_node_filename ();
+ }
+
+***************
+*** 2586,2593 ****
+ case enumerate:
+ inhibit_paragraph_indentation = 0;
+ current_indent += default_indentation_increment;
+- start_numbering (1);
+ filling_enabled = indented_fill = true;
+ break;
+
+ case alphaenumerate:
+--- 2588,2610 ----
+ case enumerate:
+ inhibit_paragraph_indentation = 0;
+ current_indent += default_indentation_increment;
+ filling_enabled = indented_fill = true;
++
++ if (*(insertion_stack->item_function))
++ {
++ if (isalpha (*(insertion_stack->item_function)))
++ {
++ start_lettering (*(insertion_stack->item_function));
++ }
++ else
++ {
++ int n = atoi (insertion_stack->item_function);
++
++ start_numbering (n);
++ }
++ }
++ else
++ start_numbering (1);
+ break;
+
+ case alphaenumerate:
+***************
+*** 2687,2693 ****
+ break;
+
+ case enumerate:
+! stop_numbering ();
+ current_indent -= default_indentation_increment;
+ break;
+
+--- 2704,2714 ----
+ break;
+
+ case enumerate:
+! if (isalpha (the_current_enumerate_type))
+! stop_lettering ();
+! else
+! stop_numbering ();
+!
+ current_indent -= default_indentation_increment;
+ break;
+
+***************
+*** 2770,2775 ****
+--- 2791,2800 ----
+ int letter_offset = 0;
+ int the_current_letter = 0;
+
++ int enumerate_type_stack[max_ns];
++ int enumerate_type_offset = 0;
++ int the_current_enumerate_type = '1';
++
+ start_numbering (at_number)
+ int at_number;
+ {
+***************
+*** 2778,2783 ****
+--- 2803,2815 ----
+ line_error ("Enumeration stack overflow");
+ return;
+ }
++ if (enumerate_type_offset + 1 == max_ns)
++ {
++ line_error ("Enumeration stack overflow");
++ return;
++ }
++ the_current_enumerate_type =
++ enumerate_type_stack[enumerate_type_offset++] = '1';
+ number_stack[number_offset++] = the_current_number;
+ the_current_number = at_number;
+ }
+***************
+*** 2787,2792 ****
+--- 2819,2831 ----
+ the_current_number = number_stack[--number_offset];
+ if (number_offset < 0)
+ number_offset = 0;
++ --enumerate_type_offset;
++ the_current_enumerate_type = enumerate_type_stack[enumerate_type_offset-1];
++ if (enumerate_type_offset < 0)
++ {
++ enumerate_type_offset = 0;
++ the_current_enumerate_type = '1';
++ }
+ }
+
+ start_lettering (at_letter)
+***************
+*** 2797,2802 ****
+--- 2836,2848 ----
+ line_error ("Alpha-enumeration stack overflow");
+ return;
+ }
++ if (enumerate_type_offset + 1 == max_ns)
++ {
++ line_error ("Enumeration stack overflow");
++ return;
++ }
++ the_current_enumerate_type =
++ enumerate_type_stack[enumerate_type_offset++] = 'a';
+ letter_stack[letter_offset++] = the_current_letter;
+ the_current_letter = at_letter;
+ }
+***************
+*** 2806,2811 ****
+--- 2852,2864 ----
+ the_current_letter = letter_stack[--letter_offset];
+ if (letter_offset < 0)
+ letter_offset = 0;
++ --enumerate_type_offset;
++ the_current_enumerate_type = enumerate_type_stack[enumerate_type_offset-1];
++ if (enumerate_type_offset < 0)
++ {
++ enumerate_type_offset = 0;
++ the_current_enumerate_type = '1';
++ }
+ }
+
+ /* Place a letter into the output stream. */
+***************
+*** 4307,4319 ****
+ temp++;
+ else
+ {
+! if (input_text[temp] == '.' ||
+! input_text[temp] == ',' ||
+ input_text[temp] == '\t')
+ return;
+ else
+ {
+! line_error ("Cross-reference must be terminated with a period or a comma");
+ return;
+ }
+ }
+--- 4360,4372 ----
+ temp++;
+ else
+ {
+! if (member (input_text[temp], ".,;:") ||
+! (px_ref_flag && input_text[temp] == ')' ) ||
+ input_text[temp] == '\t')
+ return;
+ else
+ {
+! line_error ("Cross-reference must be terminated with punctuation");
+ return;
+ }
+ }
+***************
+*** 4729,4736 ****
+ output_column++;
+ }
+ else if (current_insertion_type () == enumerate)
+! number_item ();
+! else
+ letter_item ();
+
+ /* Special hack. This makes close paragraph ignore you until
+--- 4782,4794 ----
+ output_column++;
+ }
+ else if (current_insertion_type () == enumerate)
+! {
+! if (isalpha (the_current_enumerate_type))
+! letter_item ();
+! else
+! number_item ();
+! }
+! else /* alphaenumerate or capsenumerate */
+ letter_item ();
+
+ /* Special hack. This makes close paragraph ignore you until
+
diff --git a/support/texindex.c b/support/texindex.c
index 33b5fdbc..0933aa62 100644
--- a/support/texindex.c
+++ b/support/texindex.c
@@ -1,128 +1,63 @@
/* Prepare Tex index dribble output into an actual index.
Copyright (C) 1987 Free Software Foundation, Inc.
- NO WARRANTY
-
- BECAUSE THIS PROGRAM IS LICENSED FREE OF CHARGE, WE PROVIDE ABSOLUTELY
-NO WARRANTY, TO THE EXTENT PERMITTED BY APPLICABLE STATE LAW. EXCEPT
-WHEN OTHERWISE STATED IN WRITING, FREE SOFTWARE FOUNDATION, INC,
-RICHARD M. STALLMAN AND/OR OTHER PARTIES PROVIDE THIS PROGRAM "AS IS"
-WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
-BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY
-AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE
-DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR
-CORRECTION.
-
- IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW WILL RICHARD M.
-STALLMAN, THE FREE SOFTWARE FOUNDATION, INC., AND/OR ANY OTHER PARTY
-WHO MAY MODIFY AND REDISTRIBUTE THIS PROGRAM AS PERMITTED BELOW, BE
-LIABLE TO YOU FOR DAMAGES, INCLUDING ANY LOST PROFITS, LOST MONIES, OR
-OTHER SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
-USE OR INABILITY TO USE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR
-DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY THIRD PARTIES OR
-A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS) THIS
-PROGRAM, EVEN IF YOU HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH
-DAMAGES, OR FOR ANY CLAIM BY ANY OTHER PARTY.
-
- GENERAL PUBLIC LICENSE TO COPY
-
- 1. You may copy and distribute verbatim copies of this source file
-as you receive it, in any medium, provided that you conspicuously
-and appropriately publish on each copy a valid copyright notice
-"Copyright (C) 1987 Free Software Foundation, Inc.", and include
-following the copyright notice a verbatim copy of the above disclaimer
-of warranty and of this License.
-
- 2. You may modify your copy or copies of this source file or
-any portion of it, and copy and distribute such modifications under
-the terms of Paragraph 1 above, provided that you also do the following:
-
- a) cause the modified files to carry prominent notices stating
- that you changed the files and the date of any change; and
-
- b) cause the whole of any work that you distribute or publish,
- that in whole or in part contains or is a derivative of this
- program or any part thereof, to be licensed at no charge to all
- third parties on terms identical to those contained in this
- License Agreement (except that you may choose to grant more extensive
- warranty protection to some or all third parties, at your option).
-
- c) You may charge a distribution fee for the physical act of
- transferring a copy, and you may at your option offer warranty
- protection in exchange for a fee.
-
-Mere aggregation of another unrelated program with this program (or its
-derivative) on a volume of a storage or distribution medium does not bring
-the other program under the scope of these terms.
-
- 3. You may copy and distribute this program (or a portion or derivative
-of it, under Paragraph 2) in object code or executable form under the terms
-of Paragraphs 1 and 2 above provided that you also do one of the following:
-
- a) accompany it with the complete corresponding machine-readable
- source code, which must be distributed under the terms of
- Paragraphs 1 and 2 above; or,
-
- b) accompany it with a written offer, valid for at least three
- years, to give any third party free (except for a nominal
- shipping charge) a complete machine-readable copy of the
- corresponding source code, to be distributed under the terms of
- Paragraphs 1 and 2 above; or,
-
- c) accompany it with the information you received as to where the
- corresponding source code may be obtained. (This alternative is
- allowed only for noncommercial distribution and only if you
- received the program in object code or executable form alone.)
-
-For an executable file, complete source code means all the source code for
-all modules it contains; but, as a special exception, it need not include
-source code for modules which are standard libraries that accompany the
-operating system on which the executable file runs.
-
- 4. You may not copy, sublicense, distribute or transfer this program
-except as expressly provided under this License Agreement. Any attempt
-otherwise to copy, sublicense, distribute or transfer this program is void and
-your rights to use the program under this License agreement shall be
-automatically terminated. However, parties who have received computer
-software programs from you with this License Agreement will not have
-their licenses terminated so long as such parties remain in full compliance.
-
- 5. If you wish to incorporate parts of this program into other free
-programs whose distribution conditions are different, write to the Free
-Software Foundation at 675 Mass Ave, Cambridge, MA 02139. We have not yet
-worked out a simple rule that can be stated here, but we will often permit
-this. We will be guided by the two goals of preserving the free status of
-all derivatives of our free software and of promoting the sharing and reuse of
-software.
-
- In other words, you are welcome to use, share and improve this program.
- You are forbidden to forbid anyone else to use, share and improve
- what you give them. Help stamp out software-hoarding! */
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 1, or (at your option)
+ any later version.
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#include <stdio.h>
#include <ctype.h>
+#include <errno.h>
#ifdef VMS
+#ifndef VAX11C
+#define noshare
+#endif
+
+#include <perror.h>
#include <file.h>
#define EXIT_SUCCESS ((1 << 28) | 1)
#define EXIT_FATAL ((1 << 28) | 4)
#define unlink delete
#define tell(fd) lseek(fd, 0L, 1)
-#else
+
+#else /* Not VMS */
+
+#ifdef USG
+#include <sys/types.h>
+#include <sys/fcntl.h>
+#endif
#include <sys/file.h>
#define EXIT_SUCCESS 0
#define EXIT_FATAL 1
-#endif
+
+#endif /* Not VMS */
#ifndef L_XTND
#define L_XTND 2
#endif
+#ifdef VMS
+extern noshare int sys_nerr;
+extern noshare char *sys_errlist[];
+#else
+extern int sys_nerr;
+extern char *sys_errlist[];
+#endif
+
/* When sorting in core, this structure describes one line
and the position and length of its first keyfield. */
@@ -179,7 +114,7 @@ char **linearray;
/* The allocated length of `linearray'. */
-long lines;
+long nlines;
/* Directory to use for temporary files. On Unix, it ends with a slash. */
@@ -650,7 +585,9 @@ find_braced_pos (str, words, chars, ignore_blanks)
c = *p++;
if (c == '{') bracelevel++;
if (c == '}') bracelevel--;
- if (c == '\\') c = *p++; /* \ quotes braces and \ */
+#if 0
+ if (c == '\\' || c == '@') c = *p++; /* \ quotes braces and \ */
+#endif
if (c == 0 || c == '\n') return p-1;
}
}
@@ -688,7 +625,9 @@ find_braced_end (str)
c = *p++;
if (c == '{') bracelevel++;
if (c == '}') bracelevel--;
- if (c == '\\') c = *p++;
+#if 0
+ if (c == '\\' || c == '@') c = *p++;
+#endif
if (c == 0 || c == '\n') return p-1;
}
return p - 1;
@@ -875,7 +814,7 @@ sort_offline (infile, nfiles, total, outfile)
linelength = readline (&lb, istream);
- if (lb.buffer[0] != '\\')
+ if (lb.buffer[0] != '\\' && lb.buffer[0] != '@')
{
error ("%s: not a texinfo index file", infile);
return;
@@ -907,7 +846,7 @@ sort_offline (infile, nfiles, total, outfile)
linelength = readline (&lb, istream);
if (!linelength && feof (istream)) break;
- if (lb.buffer[0] != '\\')
+ if (lb.buffer[0] != '\\' && lb.buffer[0] != '@')
{
error ("%s: not a texinfo index file", infile);
failure = 1;
@@ -980,7 +919,7 @@ sort_in_core (infile, total, outfile)
close (desc);
- if (file_size > 0 && data[0] != '\\')
+ if (file_size > 0 && data[0] != '\\' && data[0] != '@')
{
error ("%s: not a texinfo index file", infile);
return;
@@ -994,12 +933,12 @@ sort_in_core (infile, total, outfile)
/* Create the array of pointers to lines, with a default size frequently enough. */
- lines = total / 50;
- if (!lines) lines = 2;
- linearray = (char **) xmalloc (lines * sizeof (char *));
+ nlines = total / 50;
+ if (!nlines) nlines = 2;
+ linearray = (char **) xmalloc (nlines * sizeof (char *));
/* `nextline' points to the next free slot in this array.
- `lines' is the allocated size. */
+ `nlines' is the allocated size. */
nextline = linearray;
@@ -1081,7 +1020,7 @@ parsefile (filename, nextline, data, size)
while (p != end)
{
- if (p[0] != '\\')
+ if (p[0] != '\\' && p[0] != '@')
return 0;
*line = p;
@@ -1089,10 +1028,10 @@ parsefile (filename, nextline, data, size)
if (p != end) p++;
line++;
- if (line == linearray + lines)
+ if (line == linearray + nlines)
{
char **old = linearray;
- linearray = (char **) xrealloc (linearray, sizeof (char *) * (lines *= 4));
+ linearray = (char **) xrealloc (linearray, sizeof (char *) * (nlines *= 4));
line += linearray - old;
}
}
@@ -1580,14 +1519,6 @@ error (s1, s2)
perror_with_name (name)
char *name;
{
-#ifdef VMS
-#include <errno.h>
- extern noshare int sys_nerr;
- extern noshare char *sys_errlist[];
-#else
- extern int errno, sys_nerr;
- extern char *sys_errlist[];
-#endif
char *s;
if (errno < sys_nerr)
@@ -1600,8 +1531,6 @@ perror_with_name (name)
pfatal_with_name (name)
char *name;
{
- extern int errno, sys_nerr;
- extern char *sys_errlist[];
char *s;
if (errno < sys_nerr)
@@ -1659,13 +1588,16 @@ bzero (b, length)
#ifdef VMS
short zero = 0;
long max_str = 65535;
+ long len;
- while (length > max_str) {
- (void) LIB$MOVC5 (&zero, &zero, &zero, &max_str, b);
- length -= max_str;
- b += max_str;
- }
- (void) LIB$MOVC5 (&zero, &zero, &zero, &length, b);
+ while (length > max_str)
+ {
+ (void) LIB$MOVC5 (&zero, &zero, &zero, &max_str, b);
+ length -= max_str;
+ b += max_str;
+ }
+ len = length;
+ (void) LIB$MOVC5 (&zero, &zero, &zero, &len, b);
#else
while (length-- > 0)
*b++ = 0;
diff --git a/support/texinfo.tex b/support/texinfo.tex
index 0810241a..31fc8151 100644
--- a/support/texinfo.tex
+++ b/support/texinfo.tex
@@ -1,34 +1,39 @@
%% TeX macros to handle texinfo files
-% Copyright (C) 1985, 1986, 1988 Free Software Foundation, Inc.
+% Copyright (C) 1985, 1986, 1988, 1990, 1991 Free Software Foundation, Inc.
-%GNU CC is free software; you can redistribute it and/or modify
-%it under the terms of the GNU General Public License as published by
-%the Free Software Foundation; either version 1, or (at your option)
-%any later version.
+%This texinfo.tex file is free software; you can redistribute it and/or
+%modify it under the terms of the GNU General Public License as
+%published by the Free Software Foundation; either version 2, or (at
+%your option) any later version.
-%GNU CC is distributed in the hope that it will be useful,
-%but WITHOUT ANY WARRANTY; without even the implied warranty of
-%MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-%GNU General Public License for more details.
+%This texinfo.tex file is distributed in the hope that it will be
+%useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+%of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+%General Public License for more details.
%You should have received a copy of the GNU General Public License
-%along with GNU CC; see the file COPYING. If not, write to
-%the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+%along with this texinfo.tex file; see the file COPYING. If not, write
+%to the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
+%USA.
%In other words, you are welcome to use, share and improve this program.
%You are forbidden to forbid anyone else to use, share and improve
%what you give them. Help stamp out software-hoarding!
-\def\texinfoversion{2.1}
+\def\texinfoversion{2.53}
\message{Loading texinfo package [Version \texinfoversion]:}
\message{}
+% Print the version number if in a .fmt file.
+\everyjob{\message{[Texinfo version \texinfoversion]}\message{}}
+
% Save some parts of plain tex whose names we will redefine.
\let\ptexlbrace=\{
\let\ptexrbrace=\}
+\let\ptexdots=\dots
\let\ptexdot=\.
\let\ptexstar=\*
\let\ptexend=\end
@@ -75,8 +80,9 @@
\def\onepageout#1{\hoffset=\normaloffset
\ifodd\pageno \advance\hoffset by \bindingoffset
\else \advance\hoffset by -\bindingoffset\fi
+{\escapechar=`\\\relax % makes sure backslash is used in output files.
\shipout\vbox{{\let\hsize=\pagewidth \makeheadline} \pagebody{#1}%
- {\let\hsize=\pagewidth \makefootline}}
+{\let\hsize=\pagewidth \makefootline}}}%
\advancepageno \ifnum\outputpenalty>-20000 \else\dosupereject\fi}
@@ -203,7 +209,7 @@
% @@ prints an @
% Kludge this until the fonts are right (grr).
-\def\@{{\sf \char '100}}
+\def\@{{\tt \char '100}}
% Define @` and @' to be the same as ` and '
% but suppressing ligatures.
@@ -221,7 +227,7 @@
\def\:{\spacefactor=1000 }
% @* forces a line break.
-\def\*{\hfil\break}
+\def\*{\hfil\break\hbox{}\ignorespaces}
% @. is an end-of-sentence period.
\def\.{.\spacefactor=3000 }
@@ -232,9 +238,26 @@
% @group ... @end group forces ... to be all on one page.
\def\group{\begingroup% \inENV ???
+\ifnum\catcode13=\active \else
+\errmessage{@group invalid in context where filling is enabled}\fi
\def \Egroup{\egroup\endgroup}
\vbox\bgroup}
+% @need space-in-mils
+% forces a page break if there is not space-in-mils remaining.
+
+\newdimen\mil \mil=0.001in
+
+\def\need{\parsearg\needx}
+
+\def\needx #1{\par %
+% This method tries to make TeX break the page naturally
+% if the depth of the box does not fit.
+{\baselineskip=0pt%
+\vtop to #1\mil{\vfil}\kern -#1\mil\penalty 10000
+\prevdepth=-1000pt
+}}
+
% @br forces paragraph break
\let\br = \par
@@ -285,9 +308,10 @@
% @c is the same as @comment
% @ignore ... @end ignore is another way to write a comment
-\def\comment{\parsearg \commentxxx}
+\def\comment{\catcode 64=\other \catcode 123=\other \catcode 125=\other%
+\parsearg \commentxxx}
-\def\commentxxx #1{}
+\def\commentxxx #1{\catcode 64=0 \catcode 123=1 \catcode 125=2 }
\let\c=\comment
@@ -296,6 +320,7 @@
\def\ignoresections{%
\let\chapter=\relax
\let\unnumbered=\relax
+\let\top=\relax
\let\unnumberedsec=\relax
\let\unnumberedsection=\relax
\let\unnumberedsubsec=\relax
@@ -314,10 +339,19 @@
\let\appendixsubsection=\relax
\let\appendixsubsubsec=\relax
\let\appendixsubsubsection=\relax
+\let\contents=\relax
+\let\smallbook=\relax
+\let\titlepage=\relax
}
-\def\ignore{\begingroup\ignoresections\ignorexxx}
-\long\def\ignorexxx #1\end ignore{\endgroup}
+\def\ignore{\begingroup\ignoresections
+% Make sure that spaces turn into tokens that match what \ignorexxx wants.
+\catcode32=10
+\ignorexxx}
+\long\def\ignorexxx #1\end ignore{\endgroup\ignorespaces}
+
+\def\direntry{\begingroup\direntryxxx}
+\long\def\direntryxxx #1\end direntry{\endgroup\ignorespaces}
% Conditionals to test whether a flag is set.
@@ -329,7 +363,7 @@
\temp}
\def\Eifset{}
\def\ifsetfail{\begingroup\ignoresections\ifsetfailxxx}
-\long\def\ifsetfailxxx #1\end ifset{\endgroup}
+\long\def\ifsetfailxxx #1\end ifset{\endgroup\ignorespaces}
\outer\def\ifclear{\begingroup\ignoresections\parsearg\ifclearxxx}
@@ -339,18 +373,41 @@
\temp}
\def\Eifclear{}
\def\ifclearfail{\begingroup\ignoresections\ifclearfailxxx}
-\long\def\ifclearfailxxx #1\end ifclear{\endgroup}
+\long\def\ifclearfailxxx #1\end ifclear{\endgroup\ignorespaces}
+
+% @set foo to set the flag named foo.
+% @clear foo to clear the flag named foo.
+\def\set{\parsearg\setxxx}
+\def\setxxx #1{
+\expandafter\let\csname IF#1\endcsname=\set}
+
+\def\clear{\parsearg\clearxxx}
+\def\clearxxx #1{
+\expandafter\let\csname IF#1\endcsname=\relax}
% Some texinfo constructs that are trivial in tex
\def\iftex{}
\def\Eiftex{}
\def\ifinfo{\begingroup\ignoresections\ifinfoxxx}
-\long\def\ifinfoxxx #1\end ifinfo{\endgroup}
+\long\def\ifinfoxxx #1\end ifinfo{\endgroup\ignorespaces}
\long\def\menu #1\end menu{}
\def\asis#1{#1}
+% @math means output in math mode.
+% We don't use $'s directly in the definition of \math because control
+% sequences like \math are expanded when the toc file is written. Then,
+% when we read the toc file back, the $'s will be normal characters (as they
+% should be, according to the definition of Texinfo). So we must use a
+% control sequence to switch into and out of math mode.
+%
+% This isn't quite enough for @math to work properly in indices, but it
+% seems unlikely it will ever be needed there.
+%
+\let\implicitmath = $
+\def\math#1{\implicitmath #1\implicitmath}
+
\def\node{\ENVcheck\parsearg\nodezzz}
\def\nodezzz#1{\nodexxx [#1,]}
\def\nodexxx[#1,#2]{\gdef\lastnode{#1}}
@@ -378,86 +435,169 @@
\opencontents
\openindices
\fixbackslash % Turn off hack to swallow `\input texinfo'.
+ \global\let\setfilename=\comment % Ignore extra @setfilename cmds.
\comment % Ignore the actual filename.
}
\outer\def\bye{\pagealignmacro\tracingstats=1\ptexend}
\def\inforef #1{\inforefzzz #1,,,,**}
-\def\inforefzzz #1,#2,#3,#4**{See Info file \file{\losespace#3{}}, node `\losespace#1{}'}
+\def\inforefzzz #1,#2,#3,#4**{See Info file \file{\losespace#3{}},
+ node \samp{\losespace#1{}}}
\def\losespace #1{#1}
\message{fonts,}
% Font-change commands.
+% Texinfo supports the sans serif font style, which plain TeX does not.
+% So we set up a \sf analogous to plain's \rm, etc.
+\newfam\sffam
+\def\sf{\fam=\sffam \tensf}
+\let\li = \sf % Sometimes we call it \li, not \sf.
+
%% Try out Computer Modern fonts at \magstephalf
-\font\tenrm=cmr10 scaled \magstephalf
-\font\tentt=cmtt10 scaled \magstephalf
+\font\textrm=cmr10 scaled \magstephalf
+\font\texttt=cmtt10 scaled \magstephalf
% Instead of cmb10, you many want to use cmbx10.
% cmbx10 is a prettier font on its own, but cmb10
% looks better when embedded in a line with cmr10.
-\font\tenbf=cmb10 scaled \magstephalf
-\font\tenit=cmti10 scaled \magstephalf
-\font\tensl=cmsl10 scaled \magstephalf
-\font\tensf=cmss10 scaled \magstephalf
-\def\li{\sf}
-\font\tensc=cmcsc10 scaled \magstephalf
-
-% Fonts for @defun, etc.
+\font\textbf=cmb10 scaled \magstephalf
+\font\textit=cmti10 scaled \magstephalf
+\font\textsl=cmsl10 scaled \magstephalf
+\font\textsf=cmss10 scaled \magstephalf
+\font\textsc=cmcsc10 scaled \magstephalf
+\font\texti=cmmi10 scaled \magstephalf
+\font\textsy=cmsy10 scaled \magstephalf
+
+% A few fonts for @defun, etc.
\font\defbf=cmbx10 scaled \magstep1 %was 1314
-\let\deftt=\tentt
-\def\df{\let\tt=\deftt \defbf}
-
-% Font for title
-\font\titlerm = cmbx10 scaled \magstep5
-
-% Fonts for indices
-\font\indit=cmti9 \font\indrm=cmr9
-\font\indtt=cmtt9
-\def\indbf{\indrm} \def\indsl{\indit}
-\def\indexfonts{\let\it=\indit \let\sl=\indsl \let\bf=\indbf \let\rm=\indrm
-\let\tt=\indtt}
+\font\deftt=cmtt10 scaled \magstep1
+\def\df{\let\tentt=\deftt \let\tenbf = \defbf \bf}
+
+% Fonts for indices and small examples.
+% We actually use the slanted font rather than the italic,
+% because texinfo normally uses the slanted fonts for that.
+% Do not make many font distinctions in general in the index, since they
+% aren't very useful.
+\font\ninett=cmtt9
+\font\indrm=cmr9
+\font\indit=cmsl9
+\let\indsl=\indit
+\let\indtt=\ninett
+\let\indsf=\indrm
+\let\indbf=\indrm
+\let\indsc=\indrm
+\font\indi=cmmi9
+\font\indsy=cmsy9
% Fonts for headings
-\font\chaprm=cmbx10 scaled \magstep3
-\font\chapit=cmti10 scaled \magstep3
-\font\chapsl=cmsl10 scaled \magstep3
-\font\chaptt=cmtt10 scaled \magstep3
-\font\chapsf=cmss10 scaled \magstep3
+\font\chaprm=cmbx12 scaled \magstep2
+\font\chapit=cmti12 scaled \magstep2
+\font\chapsl=cmsl12 scaled \magstep2
+\font\chaptt=cmtt12 scaled \magstep2
+\font\chapsf=cmss12 scaled \magstep2
\let\chapbf=\chaprm
-
-\font\secrm=cmbx10 scaled \magstep2
-\font\secit=cmti10 scaled \magstep2
-\font\secsl=cmsl10 scaled \magstep2
-\font\sectt=cmtt10 scaled \magstep2
-\font\secsf=cmss10 scaled \magstep2
-\let\secbf=\secrm
-
-% \font\ssecrm=cmbx10 scaled \magstep1 % This size an fontlooked bad.
+\font\chapsc=cmcsc10 scaled\magstep3
+\font\chapi=cmmi12 scaled \magstep2
+\font\chapsy=cmsy10 scaled \magstep3
+
+\font\secrm=cmbx12 scaled \magstep1
+\font\secit=cmti12 scaled \magstep1
+\font\secsl=cmsl12 scaled \magstep1
+\font\sectt=cmtt12 scaled \magstep1
+\font\secsf=cmss12 scaled \magstep1
+\font\secbf=cmbx12 scaled \magstep1
+\font\secsc=cmcsc10 scaled\magstep2
+\font\seci=cmmi12 scaled \magstep1
+\font\secsy=cmsy10 scaled \magstep2
+
+% \font\ssecrm=cmbx10 scaled \magstep1 % This size and font looked bad.
% \font\ssecit=cmti10 scaled \magstep1 % The letters were too crowded.
% \font\ssecsl=cmsl10 scaled \magstep1
% \font\ssectt=cmtt10 scaled \magstep1
% \font\ssecsf=cmss10 scaled \magstep1
-\font\ssecrm=cmb10 at 13pt % Note the use of cmb rather than cmbx.
-\font\ssecit=cmti10 at 13pt % Also, the size is a little larger than
-\font\ssecsl=cmsl10 at 13pt % being scaled magstep1.
-\font\ssectt=cmtt10 at 13pt
-\font\ssecsf=cmss10 at 13pt
+%\font\ssecrm=cmb10 scaled 1315 % Note the use of cmb rather than cmbx.
+%\font\ssecit=cmti10 scaled 1315 % Also, the size is a little larger than
+%\font\ssecsl=cmsl10 scaled 1315 % being scaled magstep1.
+%\font\ssectt=cmtt10 scaled 1315
+%\font\ssecsf=cmss10 scaled 1315
+
+%\let\ssecbf=\ssecrm
+
+\font\ssecrm=cmbx12 scaled \magstephalf
+\font\ssecit=cmti12 scaled \magstephalf
+\font\ssecsl=cmsl12 scaled \magstephalf
+\font\ssectt=cmtt12 scaled \magstephalf
+\font\ssecsf=cmss12 scaled \magstephalf
+\font\ssecbf=cmbx12 scaled \magstephalf
+\font\ssecsc=cmcsc10 scaled \magstep1
+\font\sseci=cmmi12 scaled \magstephalf
+\font\ssecsy=cmsy10 scaled \magstep1
+% The smallcaps and symbol fonts should actually be scaled \magstep1.5,
+% but that is not a standard magnification.
+
+% Fonts for title page:
+\font\titlerm = cmbx12 scaled \magstep3
+\let\authorrm = \secrm
+
+% In order for the font changes to affect most math symbols and letters,
+% we have to define the \textfont of the standard families. Since
+% texinfo doesn't allow for producing subscripts and superscripts, we
+% don't bother to reset \scriptfont and \scriptscriptfont (which would
+% also require loading a lot more fonts).
+%
+\def\resetmathfonts{%
+ \textfont0 = \tenrm \textfont1 = \teni \textfont2 = \tensy
+ \textfont\itfam = \tenit \textfont\slfam = \tensl \textfont\bffam = \tenbf
+ \textfont\ttfam = \tentt \textfont\sffam = \tensf
+}
+
-\let\ssecbf=\ssecrm
+% The font-changing commands redefine the meanings of \tenSTYLE, instead
+% of just \STYLE. We do this so that font changes will continue to work
+% in math mode, where it is the current \fam that is relevant in most
+% cases, not the current font. Plain TeX does, for example,
+% \def\bf{\fam=\bffam \tenbf} By redefining \tenbf, we obviate the need
+% to redefine \bf itself.
+\def\textfonts{%
+ \let\tenrm=\textrm \let\tenit=\textit \let\tensl=\textsl
+ \let\tenbf=\textbf \let\tentt=\texttt \let\smallcaps=\textsc
+ \let\tensf=\textsf \let\teni=\texti \let\tensy=\textsy
+ \resetmathfonts}
+\def\chapfonts{%
+ \let\tenrm=\chaprm \let\tenit=\chapit \let\tensl=\chapsl
+ \let\tenbf=\chapbf \let\tentt=\chaptt \let\smallcaps=\chapsc
+ \let\tensf=\chapsf \let\teni=\chapi \let\tensy=\chapsy
+ \resetmathfonts}
+\def\secfonts{%
+ \let\tenrm=\secrm \let\tenit=\secit \let\tensl=\secsl
+ \let\tenbf=\secbf \let\tentt=\sectt \let\smallcaps=\secsc
+ \let\tensf=\secsf \let\teni=\seci \let\tensy=\secsy
+ \resetmathfonts}
+\def\subsecfonts{%
+ \let\tenrm=\ssecrm \let\tenit=\ssecit \let\tensl=\ssecsl
+ \let\tenbf=\ssecbf \let\tentt=\ssectt \let\smallcaps=\ssecsc
+ \let\tensf=\ssecsf \let\teni=\sseci \let\tensy=\ssecsy
+ \resetmathfonts}
+\def\indexfonts{%
+ \let\tenrm=\indrm \let\tenit=\indit \let\tensl=\indsl
+ \let\tenbf=\indbf \let\tentt=\indtt \let\smallcaps=\indsc
+ \let\tensf=\indsf \let\teni=\indi \let\tensy=\indsy
+ \resetmathfonts}
+
+% Set up the default fonts, so we can use them for creating boxes.
+%
+\textfonts
-\def\textfonts{\let\rm=\tenrm\let\it=\tenit\let\sl=\tensl\let\bf=\tenbf%
-\let\smallcaps=\tensc\let\sf=\tensf}
-\def\chapfonts{\let\rm=\chaprm\let\it=\chapit\let\sl=\chapsl\let\bf=\chapbf\let\tt=\chaptt\let\sf=\chapsf}
-\def\secfonts{\let\rm=\secrm\let\it=\secit\let\sl=\secsl\let\bf=\secbf\let\tt=\sectt\let\sf=\secsf}
-\def\subsecfonts{\let\rm=\ssecrm\let\it=\ssecit\let\sl=\ssecsl\let\bf=\ssecbf\let\tt=\ssectt\let\sf=\ssecsf}
% Count depth in font-changes, for error checks
\newcount\fontdepth \fontdepth=0
-% Font for table of contents.
-\font\truesecrm=cmr12
+% Fonts for short table of contents.
+\font\shortcontrm=cmr12
+\font\shortcontbf=cmbx12
+\font\shortcontsl=cmsl12
%% Add scribe-like font environments, plus @l for inline lisp (usually sans
%% serif) and @ii for TeX italic
@@ -476,11 +616,11 @@
\def\b#1{{\bf #1}}
\let\strong=\b
-\def\t#1{{\tt \rawbackslash \frenchspacing #1}\null}
+\def\t#1{{\tt \exhyphenpenalty=10000\rawbackslash \frenchspacing #1}\null}
\let\ttfont = \t
%\def\samp #1{`{\tt \rawbackslash \frenchspacing #1}'\null}
\def\samp #1{`\tclose{#1}'\null}
-\def\key #1{{\tt \uppercase{#1}}\null}
+\def\key #1{{\tt \exhyphenpenalty=10000\uppercase{#1}}\null}
\def\ctrl #1{{\tt \rawbackslash \hat}#1}
\let\file=\samp
@@ -491,6 +631,8 @@
\newdimen\tcloserm
\def\tclose#1{{\rm \tcloserm=\fontdimen2\font \tt \tclosesave=\fontdimen2\font
\fontdimen2\font=\tcloserm
+% prevent breaking lines at hyphens.
+\exhyphenpenalty=10000
\def\ {{\fontdimen2\font=\tclosesave{} }}%
\rawbackslash \frenchspacing #1\fontdimen2\font=\tclosesave}\null}
\let\code=\tclose
@@ -500,17 +642,24 @@
% then @kbd has no effect.
\def\xkey{\key}
-\def\kbdfoo#1#2#3*{\def\one{#1}\def\three{#3}\def\threex{??}%
-\ifx\one\xkey\ifx\threex\three \key{#2}
+\def\kbdfoo#1#2#3\par{\def\one{#1}\def\three{#3}\def\threex{??}%
+\ifx\one\xkey\ifx\threex\three \key{#2}%
\else\tclose{\look}\fi
\else\tclose{\look}\fi}
-\def\kbd#1{\def\look{#1}\expandafter\kbdfoo\look??*}
+% Typeset a dimension, e.g., `in' or `pt'. The only reason for the
+% argument is to make the input look right: @dmn{pt} instead of
+% @dmn{}pt.
+%
+\def\dmn#1{\thinspace #1}
+
+\def\kbd#1{\def\look{#1}\expandafter\kbdfoo\look??\par}
\def\l#1{{\li #1}\null} %
\def\r#1{{\rm #1}} % roman font
-\def\sc#1{{\smallcaps #1}} % smallcaps font
+% Use of \lowercase was suggested.
+\def\sc#1{{\smallcaps#1}} % smallcaps font
\def\ii#1{{\it #1}} % italic font
\message{page headings,}
@@ -519,33 +668,32 @@
\newskip\titlepagebottomglue \titlepagebottomglue = 2pc
% First the title page. Must do @settitle before @titlepage.
-\font\titlerm = cmbx12 scaled \magstep2
\def\titlefont#1{{\titlerm #1}}
\newtoks\realeverypar
\newif\ifseenauthor
+\newif\iffinishedtitlepage
\def\titlepage{\begingroup \parindent=0pt \textfonts
- \font\subtitlerm = cmr10 scaled \magstephalf
- \def\subtitlefont{\subtitlerm \normalbaselineskip = 12pt \normalbaselines}%
+ \let\subtitlerm=\tenrm
+% I deinstalled the following change because \cmr12 is undefined.
+% This change was not in the ChangeLog anyway. --rms.
+% \let\subtitlerm=\cmr12
+ \def\subtitlefont{\subtitlerm \normalbaselineskip = 13pt \normalbaselines}%
%
- \font\authorrm = cmbx12 scaled \magstep1
\def\authorfont{\authorrm \normalbaselineskip = 16pt \normalbaselines}%
%
- % The first subtitle should have some space before it, but not the
- % others. They all should be ragged left.
-% This code caused a bug, since two groups were started, but only
-% one was ended. Also, I can't see the point of this code.
-% \begingroup \realeverypar = {\leftskip = 2in plus 3em minus 1em
-% \parfillskip = 0pt}%
-% \everypar = {\vglue \baselineskip \the\realeverypar
-% \everypar={\the\realeverypar}}%
+ % Leave some space at the very top of the page.
+ \vglue\titlepagetopglue
%
% Now you can print the title using @title.
\def\title{\parsearg\titlezzz}%
- \def\titlezzz##1{\leftline{\titlefont{##1}
- \vskip4pt \hrule height 4pt \vskip4pt}}%
- \vglue\titlepagetopglue
+ \def\titlezzz##1{\leftline{\titlefont{##1}}
+ % print a rule at the page bottom also.
+ \finishedtitlepagefalse
+ \vskip4pt \hrule height 4pt \vskip4pt}%
+ % No rule at page bottom unless we print one at the top with @title.
+ \finishedtitlepagetrue
%
% Now you can put text using @subtitle.
\def\subtitle{\parsearg\subtitlezzz}%
@@ -559,12 +707,34 @@
% Most title ``pages'' are actually two pages long, with space
% at the top of the second. We don't want the ragged left on the second.
\let\oldpage = \page
-% \def\page{\vskip4pt \hrule height 2pt \vskip\titlepagebottomglue
-% \oldpage \endgroup\hrule height0pt\relax}%
- \def\page{\oldpage \hbox{}}
+ \def\page{%
+ \iffinishedtitlepage\else
+ \finishtitlepage
+ \fi
+ \oldpage
+ \let\page = \oldpage
+ \hbox{}}%
+% \def\page{\oldpage \hbox{}}
}
-\def\Etitlepage{\endgroup\page\HEADINGSon}
+\def\Etitlepage{%
+ \iffinishedtitlepage\else
+ \finishtitlepage
+ \fi
+ % It is important to do the page break before ending the group,
+ % because the headline and footline are only empty inside the group.
+ % If we use the new definition of \page, we always get a blank page
+ % after the title page, which we certainly don't want.
+ \oldpage
+ \endgroup
+ \HEADINGSon
+}
+
+\def\finishtitlepage{%
+ \vskip4pt \hrule height 2pt
+ \vskip\titlepagebottomglue
+ \finishedtitlepagetrue
+}
%%% Set up page headings and footings.
@@ -576,8 +746,11 @@
\newtoks \oddfootline % Token sequence for footing line of odd pages
% Now make Tex use those variables
-\headline={{\textfonts\rm \ifodd\pageno \the\oddheadline \else \the\evenheadline \fi}}
-\footline={{\textfonts\rm \ifodd\pageno \the\oddfootline \else \the\evenfootline \fi}}
+\headline={{\textfonts\rm \ifodd\pageno \the\oddheadline
+ \else \the\evenheadline \fi}}
+\footline={{\textfonts\rm \ifodd\pageno \the\oddfootline
+ \else \the\evenfootline \fi}\HEADINGShook}
+\let\HEADINGShook=\relax
% Commands to set those variables.
% For example, this is what @headings on does
@@ -628,6 +801,9 @@
% @headings single turns headings on for single-sided printing.
% @headings off turns them off.
% @headings on same as @headings double, retained for compatibility.
+% @headings after turns on double-sided headings after this page.
+% @headings doubleafter turns on double-sided headings after this page.
+% @headings singleafter turns on single-sided headings after this page.
% By default, they are off.
\def\headings #1 {\csname HEADINGS#1\endcsname}
@@ -661,6 +837,23 @@
}
\def\HEADINGSon{\HEADINGSdouble}
+\def\HEADINGSafter{\let\HEADINGShook=\HEADINGSdoublex}
+\let\HEADINGSdoubleafter=\HEADINGSafter
+\def\HEADINGSdoublex{%
+\global\evenfootline={\hfil}
+\global\oddfootline={\hfil}
+\global\evenheadline={\line{\folio\hfil\thistitle}}
+\global\oddheadline={\line{\thischapter\hfil\folio}}
+}
+
+\def\HEADINGSsingleafter{\let\HEADINGShook=\HEADINGSsinglex}
+\def\HEADINGSsinglex{%
+\global\evenfootline={\hfil}
+\global\oddfootline={\hfil}
+\global\evenheadline={\line{\thischapter\hfil\folio}}
+\global\oddheadline={\line{\thischapter\hfil\folio}}
+}
+
% Subroutines used in generating headings
% Produces Day Month Year style of output.
\def\today{\number\day\space
@@ -684,6 +877,17 @@ July\or August\or September\or October\or November\or December\fi
\message{tables,}
+% @tabs -- simple alignment
+
+% These don't work. For one thing, \+ is defined as outer.
+% So these macros cannot even be defined.
+
+%\def\tabs{\parsearg\tabszzz}
+%\def\tabszzz #1{\settabs\+#1\cr}
+%\def\tabline{\parsearg\tablinezzz}
+%\def\tablinezzz #1{\+#1\cr}
+%\def\&{&}
+
% Tables -- @table, @ftable, @item(x), @kitem(x), @xitem(x).
% default indentation of table text
@@ -709,9 +913,11 @@ July\or August\or September\or October\or November\or December\fi
\def\internalBkitem{\smallbreak \parsearg\kitemzzz}
\def\internalBkitemx{\par \parsearg\kitemzzz}
-\def\kitemzzz #1{\dosubind {kw}{\code{#1}}{for {\bf \lastfunction}}\itemzzz {#1}}
+\def\kitemzzz #1{\dosubind {kw}{\code{#1}}{for {\bf \lastfunction}}%
+ \itemzzz {#1}}
-\def\xitemzzz #1{\dosubind {kw}{\code{#1}}{for {\bf \xitemsubtopic}}\itemzzz {#1}}
+\def\xitemzzz #1{\dosubind {kw}{\code{#1}}{for {\bf \xitemsubtopic}}%
+ \itemzzz {#1}}
\def\itemzzz #1{\begingroup %
\advance \hsize by -\rightskip %
@@ -747,7 +953,9 @@ July\or August\or September\or October\or November\or December\fi
\def\ftable{\begingroup\inENV\obeylines\obeyspaces\ftablex}
{\obeylines\obeyspaces%
\gdef\ftablex #1^^M{%
-\tabley\fnitemindex#1 \endtabley}}
+\tabley\fnitemindex#1 \endtabley
+\def\Eftable{\endgraf\endgroup\afterenvbreak}%
+\let\Etable=\relax}}
\def\dontindex #1{}
\def\fnitemindex #1{\doind {fn}{\code{#1}}}%
@@ -810,7 +1018,25 @@ July\or August\or September\or October\or November\or December\fi
\def\frenchspacing{\sfcode46=1000 \sfcode63=1000 \sfcode33=1000
\sfcode58=1000 \sfcode59=1000 \sfcode44=1000 }
-\def\enumerate{\itemizey{\the\itemno.}\Eenumerate\flushcr}
+% Allow argument of `a', `A' or `1' to specify type of enumeration.
+% @enumerate hands the rest of its line to \enumeratezzz via \parsearg.
+\def\enumerate{\parsearg\enumeratezzz}
+% Append a space and the \endenumeratey delimiter so that \enumeratey
+% can always match its parameter text, even with an empty argument.
+\def\enumeratezzz #1{\enumeratey #1 \endenumeratey}
+% The first word of the argument is tested with \if against `a' and `A';
+% a match selects \alphaenumerate or \capsenumerate, anything else gets
+% the numeric \itemno labels.  The rest of the argument is discarded.
+\def\enumeratey #1 #2\endenumeratey{
+\if#1a \alphaenumerate\else\if#1A \capsenumerate\else
+\itemizey{\the\itemno.}\Eenumerate\flushcr
+\fi\fi}
+
+% Enumeration labelled with lower-case letters: the label is chosen by
+% \ifcase on \itemno (1 gives a, ..., 26 gives z) and followed by a period.
+% An item number above 26 reaches the \else branch and raises \errmessage.
+\def\alphaenumerate{\itemizey{\ifcase\itemno\or
+a\or b\or c\or d\or e\or f\or g\or h\or i\or j\or k\or l\or m\or n\or o\or
+p\or q\or r\or s\or t\or u\or v\or w\or x\or y\or z\else
+\errmessage{More than 26 items in @alphaenumerate; get a bigger alphabet.}\fi.}%
+\Ealphaenumerate\flushcr}
+
+% Enumeration labelled with upper-case letters: the label is chosen by
+% \ifcase on \itemno (1 gives A, ..., 26 gives Z) and followed by a period.
+% An item number above 26 reaches the \else branch and raises \errmessage.
+\def\capsenumerate{\itemizey{\ifcase\itemno\or
+A\or B\or C\or D\or E\or F\or G\or H\or I\or J\or K\or L\or M\or N\or O\or
+P\or Q\or R\or S\or T\or U\or V\or W\or X\or Y\or Z\else
+\errmessage{More than 26 items in @capsenumerate; get a bigger alphabet.}\fi.}%
+\Ecapsenumerate\flushcr}
% Definition of @item while inside @itemize.
@@ -864,6 +1090,8 @@ July\or August\or September\or October\or November\or December\fi
% @synindex foo bar makes index foo feed into index bar.
% Do this instead of @defindex foo if you don't want it as a separate index.
\def\synindex #1 #2 {%
+% Make the first index's ...indfile name mean the same thing as the
+% second index's; \synindexfoo is only a temporary for the two-step \let.
+\expandafter\let\expandafter\synindexfoo\expandafter=\csname#2indfile\endcsname
+\expandafter\let\csname#1indfile\endcsname=\synindexfoo
\expandafter\xdef\csname#1index\endcsname{% % Define \xxxindex
\noexpand\doindex {#2}}%
}
@@ -871,6 +1099,8 @@ July\or August\or September\or October\or November\or December\fi
% @syncodeindex foo bar similar, but put all entries made for index foo
% inside @code.
\def\syncodeindex #1 #2 {%
+% Make the first index's ...indfile name mean the same thing as the
+% second index's; \synindexfoo is only a temporary for the two-step \let.
+\expandafter\let\expandafter\synindexfoo\expandafter=\csname#2indfile\endcsname
+\expandafter\let\csname#1indfile\endcsname=\synindexfoo
\expandafter\xdef\csname#1index\endcsname{% % Define \xxxindex
\noexpand\docodeindex {#2}}%
}
@@ -893,14 +1123,24 @@ July\or August\or September\or October\or November\or December\fi
\def\singlecodeindexer #1{\doind{\indexname}{\code{#1}}}
\def\indexdummies{%
+\def\_{{\realbackslash _}}%
+\def\w{\realbackslash w }%
\def\bf{\realbackslash bf }%
\def\rm{\realbackslash rm }%
\def\sl{\realbackslash sl }%
+\def\sf{\realbackslash sf}%
+\def\tt{\realbackslash tt}%
+\def\gtr{\realbackslash gtr}%
+\def\less{\realbackslash less}%
+\def\hat{\realbackslash hat}%
+\def\char{\realbackslash char}%
+\def\TeX{\realbackslash TeX}%
\def\dots{\realbackslash dots }%
\def\copyright{\realbackslash copyright }%
\def\tclose##1{\realbackslash tclose {##1}}%
\def\code##1{\realbackslash code {##1}}%
\def\samp##1{\realbackslash samp {##1}}%
+% Write @t back out under its own name, like the other font dummies;
+% emitting \r here would make the index typeset the text in roman.
+\def\t##1{\realbackslash t {##1}}%
\def\r##1{\realbackslash r {##1}}%
\def\i##1{\realbackslash i {##1}}%
\def\b##1{\realbackslash b {##1}}%
@@ -914,7 +1154,12 @@ July\or August\or September\or October\or November\or December\fi
% \indexnofonts no-ops all font-change commands.
% This is used when outputting the strings to sort the index by.
+% Stand-in for a font command: yields its argument unchanged.
\def\indexdummyfont#1{#1}
+% Plain-text stand-ins that \indexnofonts assigns to \TeX and \dots.
+\def\indexdummytex{TeX}
+\def\indexdummydots{...}
+
\def\indexnofonts{%
+\let\w=\indexdummyfont
+\let\t=\indexdummyfont
\let\r=\indexdummyfont
\let\i=\indexdummyfont
\let\b=\indexdummyfont
@@ -923,7 +1168,7 @@ July\or August\or September\or October\or November\or December\fi
\let\cite=\indexdummyfont
\let\sc=\indexdummyfont
%Don't no-op \tt, since it isn't a user-level command
-% and is used in the definitions of the actuve chars like <, >, |...
+% and is used in the definitions of the active chars like <, >, |...
%\let\tt=\indexdummyfont
\let\tclose=\indexdummyfont
\let\code=\indexdummyfont
@@ -932,6 +1177,8 @@ July\or August\or September\or October\or November\or December\fi
\let\kbd=\indexdummyfont
\let\key=\indexdummyfont
\let\var=\indexdummyfont
+\let\TeX=\indexdummytex
+\let\dots=\indexdummydots
}
% To define \realbackslash, we must make \ not be an escape.
@@ -944,8 +1191,8 @@ July\or August\or September\or October\or November\or December\fi
\let\indexbackslash=0 %overridden during \printindex.
\def\doind #1#2{%
+{\count10=\lastpenalty %
{\indexdummies % Must do this here, since \bf, etc expand at this stage
-\count10=\lastpenalty %
\escapechar=`\\%
{\let\folio=0% Expand all macros now EXCEPT \folio
\def\rawbackslashxx{\indexbackslash}% \indexbackslash isn't defined now
@@ -962,11 +1209,11 @@ July\or August\or September\or October\or November\or December\fi
\write \csname#1indfile\endcsname{%
\realbackslash entry {\temp1}{\folio}{#2}}}%
\temp }%
-\penalty\count10}}
+}\penalty\count10}}
\def\dosubind #1#2#3{%
+{\count10=\lastpenalty %
{\indexdummies % Must do this here, since \bf, etc expand at this stage
-\count10=\lastpenalty %
\escapechar=`\\%
{\let\folio=0%
\def\rawbackslashxx{\indexbackslash}%
@@ -982,7 +1229,7 @@ July\or August\or September\or October\or November\or December\fi
\write \csname#1indfile\endcsname{%
\realbackslash entry {\temp1}{\folio}{#2}{#3}}}%
\temp }%
-\penalty\count10}}
+}\penalty\count10}}
% The index entry written in the file actually looks like
% \entry {sortstring}{page}{topic}
@@ -1024,6 +1271,7 @@ July\or August\or September\or October\or November\or December\fi
\def\printindex{\parsearg\doprintindex}
\def\doprintindex#1{\tex %
+\dobreak \chapheadingskip {10000}
\catcode`\%=\other\catcode`\&=\other\catcode`\#=\other
\catcode`\$=\other\catcode`\_=\other
\catcode`\~=\other
@@ -1040,7 +1288,12 @@ July\or August\or September\or October\or November\or December\fi
\indexfonts\rm \tolerance=9500 \advance\baselineskip -1pt
\begindoublecolumns
\openin 1 \jobname.#1s
-\ifeof 1 \else \closein 1 \input \jobname.#1s
+\ifeof 1
+% \enddoublecolumns gets confused if there is no text in the index,
+% and it loses the chapter title and the aux file entries for the index.
+% The easiest way to prevent this problem is to make sure there is some text.
+(Index is empty)
+\else \closein 1 \input \jobname.#1s
\fi
\enddoublecolumns
\Etex}
@@ -1053,7 +1306,7 @@ July\or August\or September\or October\or November\or December\fi
\newskip\initialskipamount \initialskipamount 12pt plus4pt
\outer\def\initial #1{%
-{\let\tentt=\sectt \let\sf=\sectt
+{\let\tentt=\sectt \let\tt=\sectt \let\sf=\sectt
\ifdim\lastskip<\initialskipamount
\removelastskip \penalty-200 \vskip \initialskipamount\fi
\line{\secbf#1\hfill}\kern 2pt\penalty10000}}
@@ -1061,9 +1314,13 @@ July\or August\or September\or October\or November\or December\fi
+% \entry: typeset one index line as an unindented paragraph with the first
+% argument in an \hbox, then dot leaders, then the second argument.
+% Continuation lines are indented by 1in (\hangindent with \hangafter=1).
\outer\def\entry #1#2{
{\parfillskip=0in \parskip=0in \parindent=0in
\hangindent=1in \hangafter=1%
-\noindent\hbox{#1}\dotfill #2\par
+\noindent\hbox{#1}\indexdotfill #2\par
}}
+% Like \dotfill except takes at least 1 em: the leader glue is
+% 1em plus 1fill.  The repeated box is a period in math mode with
+% 1.5mu on each side; \cleaders centers the run of boxes in the glue.
+\def\indexdotfill{\cleaders
+ \hbox{$\mathsurround=0pt \mkern1.5mu . \mkern1.5mu$}\hskip 1em plus 1fill}
+
\def\primary #1{\line{#1\hfil}}
\newskip\secondaryindent \secondaryindent=0.5cm
@@ -1071,7 +1328,7 @@ July\or August\or September\or October\or November\or December\fi
+% \secondary: like \entry, but the line starts with \secondaryindent of
+% space before the boxed first argument.  Dot leaders and the second
+% argument follow; continuation lines hang by 1in.
\def\secondary #1#2{
{\parfillskip=0in \parskip=0in
\hangindent =1in \hangafter=1
-\noindent\hskip\secondaryindent\hbox{#1}\dotfill #2\par
+\noindent\hskip\secondaryindent\hbox{#1}\indexdotfill #2\par
}}
%% Define two-column mode, which is used in indexes.
@@ -1085,8 +1342,10 @@ July\or August\or September\or October\or November\or December\fi
\newdimen\availdimen@
+% \begindoublecolumns opens a group, uses a temporary output routine to
+% save the material already on the page into \partialpage (forced by
+% \eject), then switches to \doublecolumnout with the double-column
+% \hsize and \vsize.
\def\begindoublecolumns{\begingroup
- \output={\global\setbox\partialpage=\vbox{\unvbox255\kern -\topskip \kern \baselineskip}}\eject
- \output={\doublecolumnout} \hsize=\doublecolumnhsize \vsize=\doublecolumnvsize}
+ \output={\global\setbox\partialpage=
+ \vbox{\unvbox255\kern -\topskip \kern \baselineskip}}\eject
+ \output={\doublecolumnout}%
+ \hsize=\doublecolumnhsize \vsize=\doublecolumnvsize}
+% \enddoublecolumns flushes the remaining material through \balancecolumns,
+% closes the group opened above, and resets \pagegoal to \vsize.
\def\enddoublecolumns{\output={\balancecolumns}\eject
\endgroup \pagegoal=\vsize}
@@ -1132,9 +1391,9 @@ July\or August\or September\or October\or November\or December\fi
% Define chapters, sections, etc.
\newcount \chapno
-\newcount \secno
-\newcount \subsecno
-\newcount \subsubsecno
+\newcount \secno \secno=0
+\newcount \subsecno \subsecno=0
+\newcount \subsubsecno \subsubsecno=0
% This counter is funny since it counts through charcodes of letters A, B, ...
\newcount \appendixno \appendixno = `\@
@@ -1156,44 +1415,72 @@ July\or August\or September\or October\or November\or December\fi
\def\chapternofonts{%
\let\rawbackslash=\relax%
\let\frenchspacing=\relax%
+\def\TeX{\realbackslash TeX}
+\def\dots{\realbackslash dots}
+\def\copyright{\realbackslash copyright}
+\def\tt{\realbackslash tt}
+\def\bf{\realbackslash bf }
+\def\w{\realbackslash w}
+\def\less{\realbackslash less}
+\def\gtr{\realbackslash gtr}
+\def\hat{\realbackslash hat}
\def\char{\realbackslash char}
\def\tclose##1{\realbackslash tclose {##1}}
\def\code##1{\realbackslash code {##1}}
\def\samp##1{\realbackslash samp {##1}}
\def\r##1{\realbackslash r {##1}}
-\def\i##1{\realbackslash i {##1}}
\def\b##1{\realbackslash b {##1}}
-\def\cite##1{\realbackslash cite {##1}}
\def\key##1{\realbackslash key {##1}}
\def\file##1{\realbackslash file {##1}}
-\def\var##1{\realbackslash var {##1}}
\def\kbd##1{\realbackslash kbd {##1}}
+% These are redefined because @smartitalic wouldn't work inside xdef.
+\def\i##1{\realbackslash i {##1}}
+\def\cite##1{\realbackslash cite {##1}}
+\def\var##1{\realbackslash var {##1}}
+\def\emph##1{\realbackslash emph {##1}}
+\def\dfn##1{\realbackslash dfn {##1}}
}
+% @chapter: start a numbered chapter.  Resets the section counters,
+% advances \chapno, prints the heading through \chapmacro, records the
+% running-head text, writes a \chapentry line to \contentsfile, and makes
+% @section, @subsection and @subsubsection mean the numbered variants.
\outer\def\chapter{\parsearg\chapterzzz}
\def\chapterzzz #1{\seccheck{chapter}%
-\secno=0 \subsecno=0 \subsubsecno=0 \global\advance \chapno by 1 \message{Chapter \the\chapno}%
+\secno=0 \subsecno=0 \subsubsecno=0
+\global\advance \chapno by 1 \message{Chapter \the\chapno}%
\chapmacro {#1}{\the\chapno}%
-\gdef\thissection{#1}\gdef\thischapter{#1}%
+\gdef\thissection{#1}%
+\gdef\thischaptername{#1}%
+% We don't substitute the actual chapter name into \thischapter
+% because we don't want its macros evaluated now.
+\xdef\thischapter{Chapter \the\chapno: \noexpand\thischaptername}%
+% The contents entry is built inside a group in which \chapternofonts
+% has redefined the font commands to write themselves out literally.
{\chapternofonts%
\edef\temp{{\realbackslash chapentry {#1}{\the\chapno}{\noexpand\folio}}}%
\escapechar=`\\%
\write \contentsfile \temp %
\donoderef %
+% \global is needed because these assignments happen inside the group.
+\global\let\section = \numberedsec
+\global\let\subsection = \numberedsubsec
+\global\let\subsubsection = \numberedsubsubsec
}}
+% @appendix: start a lettered appendix.  Resets the section counters,
+% advances \appendixno, prints the heading through \chapmacro, records the
+% running-head text, writes a \chapentry line to \contentsfile, and makes
+% @section, @subsection and @subsubsection mean the appendix variants.
\outer\def\appendix{\parsearg\appendixzzz}
\def\appendixzzz #1{\seccheck{appendix}%
-\secno=0 \subsecno=0 \subsubsecno=0 \global\advance \appendixno by 1 \message{Appendix \appendixletter}%
+\secno=0 \subsecno=0 \subsubsecno=0
+\global\advance \appendixno by 1 \message{Appendix \appendixletter}%
\chapmacro {#1}{Appendix \appendixletter}%
-\gdef\thischapter{#1}\gdef\thissection{#1}%
+\gdef\thissection{#1}%
+\gdef\thischaptername{#1}%
+% \thischaptername is kept unexpanded (\noexpand) so that macros in the
+% appendix title are not evaluated by this \xdef.
+\xdef\thischapter{Appendix \appendixletter: \noexpand\thischaptername}%
{\chapternofonts%
-\edef\temp{{\realbackslash chapentry {#1}{Appendix \appendixletter}{\noexpand\folio}}}%
+\edef\temp{{\realbackslash chapentry
+ {#1}{Appendix \appendixletter}{\noexpand\folio}}}%
\escapechar=`\\%
\write \contentsfile \temp %
\appendixnoderef %
+% \global is needed because these assignments happen inside the group.
+\global\let\section = \appendixsec
+\global\let\subsection = \appendixsubsec
+\global\let\subsubsection = \appendixsubsubsec
}}
+\outer\def\top{\parsearg\unnumberedzzz}
\outer\def\unnumbered{\parsearg\unnumberedzzz}
\def\unnumberedzzz #1{\seccheck{unnumbered}%
\secno=0 \subsecno=0 \subsubsecno=0 \message{(#1)}
@@ -1204,10 +1491,13 @@ July\or August\or September\or October\or November\or December\fi
\escapechar=`\\%
\write \contentsfile \temp %
\unnumbnoderef %
+\global\let\section = \unnumberedsec
+\global\let\subsection = \unnumberedsubsec
+\global\let\subsubsection = \unnumberedsubsubsec
}}
-\outer\def\section{\parsearg\sectionzzz}
-\def\sectionzzz #1{\seccheck{section}%
+\outer\def\numberedsec{\parsearg\seczzz}
+\def\seczzz #1{\seccheck{section}%
\subsecno=0 \subsubsecno=0 \global\advance \secno by 1 %
\gdef\thissection{#1}\secheading {#1}{\the\chapno}{\the\secno}%
{\chapternofonts%
@@ -1244,8 +1534,8 @@ July\or August\or September\or October\or November\or December\fi
\penalty 10000 %
}}
-\outer\def\subsection{\parsearg\subsectionzzz}
-\def\subsectionzzz #1{\seccheck{subsection}%
+\outer\def\numberedsubsec{\parsearg\numberedsubseczzz}
+\def\numberedsubseczzz #1{\seccheck{subsection}%
\gdef\thissection{#1}\subsubsecno=0 \global\advance \subsecno by 1 %
\subsecheading {#1}{\the\chapno}{\the\secno}{\the\subsecno}%
{\chapternofonts%
@@ -1281,13 +1571,16 @@ July\or August\or September\or October\or November\or December\fi
\penalty 10000 %
}}
-\outer\def\subsubsection{\parsearg\subsubsectionzzz}
-\def\subsubsectionzzz #1{\seccheck{subsubsection}%
+\outer\def\numberedsubsubsec{\parsearg\numberedsubsubseczzz}
+\def\numberedsubsubseczzz #1{\seccheck{subsubsection}%
\gdef\thissection{#1}\global\advance \subsubsecno by 1 %
-\subsubsecheading {#1}{\the\chapno}{\the\secno}{\the\subsecno}{\the\subsubsecno}%
+\subsubsecheading {#1}
+ {\the\chapno}{\the\secno}{\the\subsecno}{\the\subsubsecno}%
{\chapternofonts%
\edef\temp{{\realbackslash subsubsecentry %
-{#1}{\the\chapno}{\the\secno}{\the\subsecno}{\the\subsubsecno}{\noexpand\folio}}}%\
+ {#1}
+ {\the\chapno}{\the\secno}{\the\subsecno}{\the\subsubsecno}
+ {\noexpand\folio}}}%
\escapechar=`\\%
\write \contentsfile \temp %
\donoderef %
@@ -1297,10 +1590,12 @@ July\or August\or September\or October\or November\or December\fi
\outer\def\appendixsubsubsec{\parsearg\appendixsubsubseczzz}
\def\appendixsubsubseczzz #1{\seccheck{appendixsubsubsec}%
\gdef\thissection{#1}\global\advance \subsubsecno by 1 %
-\subsubsecheading {#1}{\appendixletter}{\the\secno}{\the\subsecno}{\the\subsubsecno}%
+\subsubsecheading {#1}
+ {\appendixletter}{\the\secno}{\the\subsecno}{\the\subsubsecno}%
{\chapternofonts%
\edef\temp{{\realbackslash subsubsecentry{#1}%
-{\appendixletter}{\the\secno}{\the\subsecno}{\the\subsubsecno}{\noexpand\folio}}}%\
+ {\appendixletter}
+ {\the\secno}{\the\subsecno}{\the\subsubsecno}{\noexpand\folio}}}%
\escapechar=`\\%
\write \contentsfile \temp %
\appendixnoderef %
@@ -1319,6 +1614,8 @@ July\or August\or September\or October\or November\or December\fi
}}
% These are variants which are not "outer", so they can appear in @ifinfo.
+% Actually, they should now be obsolete; ordinary section commands should work.
+\def\infotop{\parsearg\unnumberedzzz}
\def\infounnumbered{\parsearg\unnumberedzzz}
\def\infounnumberedsec{\parsearg\unnumberedseczzz}
\def\infounnumberedsubsec{\parsearg\unnumberedsubseczzz}
@@ -1334,17 +1631,44 @@ July\or August\or September\or October\or November\or December\fi
-\def\infosubsection{\parsearg\subsectionzzz}
-\def\infosubsubsection{\parsearg\subsubsectionzzz}
+% The numbered subsection workers are now named \numberedsubseczzz and
+% \numberedsubsubseczzz; refer to them by their current names so these
+% non-outer variants do not expand to undefined control sequences.
+\def\infosubsection{\parsearg\numberedsubseczzz}
+\def\infosubsubsection{\parsearg\numberedsubsubseczzz}
+% These macros control what the section commands do, according
+% to what kind of chapter we are in (ordinary, appendix, or unnumbered).
+% Define them by default for a numbered chapter.
+\global\let\section = \numberedsec
+\global\let\subsection = \numberedsubsec
+\global\let\subsubsection = \numberedsubsubsec
+
% Define @majorheading, @heading and @subheading
-\def\majorheading #1{%
+% NOTE on use of \vbox for chapter headings, section headings, and
+% such:
+% 1) We use \vbox rather than the earlier \line to permit
+% overlong headings to fold.
+% 2) \hyphenpenalty is set to 10000 because hyphenation in a
+% heading is obnoxious; this forbids it.
+% 3) Likewise, headings look best if no \parindent is used, and
+% if justification is not attempted. Hence \raggedright.
+
+
+% @majorheading: takes the rest of the line as the title (via \parsearg).
+% Breaks as for a chapter but with 10pt extra \chapheadingskip, then sets
+% the title in chapter fonts inside a \vbox so that a long title can fold.
+\def\majorheading{\parsearg\majorheadingzzz}
+\def\majorheadingzzz #1{%
{\advance\chapheadingskip by 10pt \chapbreak }%
-{\chapfonts \line{\rm #1\hfill}}\bigskip \par\penalty 200}
+{\chapfonts \vbox{\hyphenpenalty=10000\tolerance=5000
+ \parindent=0pt\raggedright
+ \rm #1\hfill}}\bigskip \par\penalty 200}
-\def\chapheading #1{\chapbreak %
-{\chapfonts \line{\rm #1\hfill}}\bigskip \par\penalty 200}
+\def\chapheading{\parsearg\chapheadingzzz}
+\def\chapheadingzzz #1{\chapbreak %
+{\chapfonts \vbox{\hyphenpenalty=10000\tolerance=5000
+ \parindent=0pt\raggedright
+ \rm #1\hfill}}\bigskip \par\penalty 200}
\def\heading{\parsearg\secheadingi}
+\def\subheading{\parsearg\subsecheadingi}
+
+\def\subsubheading{\parsearg\subsubsecheadingi}
+
% These macros generate a chapter, section, etc. heading only
% (including whitespace, linebreaking, etc. around it),
% given all the information in convenient, parsed form.
@@ -1386,18 +1710,28 @@ July\or August\or September\or October\or November\or December\fi
\global\let\unnumbchapmacro=\unnchfplain}
+% \chfplain: plain chapter heading.  The second argument, an \enspace and
+% then the first argument are set in chapter fonts inside a ragged-right
+% \vbox with hyphenation forbidden, followed by \bigskip and \penalty5000.
+% NOTE(review): the removed version printed a period after the second
+% argument and issued \par after \bigskip; the new one does neither --
+% confirm that both omissions are intended.
\def\chfplain #1#2{%
-\pchapsepmacro %
-{\chapfonts \line{\rm #2.\enspace #1\hfill}}\bigskip \par\penalty 5000 %
+ \pchapsepmacro
+ {%
+ \chapfonts \vbox{\hyphenpenalty=10000\tolerance=5000
+ \parindent=0pt\raggedright
+ \rm #2\enspace #1}%
+ }%
+ \bigskip
+ \penalty5000
}
\def\unnchfplain #1{%
\pchapsepmacro %
-{\chapfonts \line{\rm #1\hfill}}\bigskip \par\penalty 10000 %
+{\chapfonts \vbox{\hyphenpenalty=10000\tolerance=5000
+ \parindent=0pt\raggedright
+ \rm #1\hfill}}\bigskip \par\penalty 10000 %
}
\CHAPFplain % The default
\def\unnchfopen #1{%
-\chapoddpage {\chapfonts \line{\rm #1\hfill}}\bigskip \par\penalty 10000 %
+\chapoddpage {\chapfonts \vbox{\hyphenpenalty=10000\tolerance=5000
+ \parindent=0pt\raggedright
+ \rm #1\hfill}}\bigskip \par\penalty 10000 %
}
\def\chfopen #1#2{\chapoddpage {\chapfonts
@@ -1425,24 +1759,32 @@ July\or August\or September\or October\or November\or December\fi
\def\plainsecheading #1{\secheadingi {#1}}
\def\secheadingi #1{{\advance \secheadingskip by \parskip %
\secheadingbreak}%
-{\secfonts \line{\rm #1\hfill}}%
+{\secfonts \vbox{\hyphenpenalty=10000\tolerance=5000
+ \parindent=0pt\raggedright
+ \rm #1\hfill}}%
\ifdim \parskip<10pt \kern 10pt\kern -\parskip\fi \penalty 10000 }
% Subsection fonts are the base font at magstep1,
% which produces a size of 12 points.
-\def\subsecheading #1#2#3#4{{\advance \subsecheadingskip by \parskip %
+\def\subsecheading #1#2#3#4{\subsecheadingi {#2.#3.#4\enspace #1}}
+\def\subsecheadingi #1{{\advance \subsecheadingskip by \parskip %
\subsecheadingbreak}%
-{\subsecfonts \line{\rm#2.#3.#4\enspace #1\hfill}}%
+{\subsecfonts \vbox{\hyphenpenalty=10000\tolerance=5000
+ \parindent=0pt\raggedright
+ \rm #1\hfill}}%
\ifdim \parskip<10pt \kern 10pt\kern -\parskip\fi \penalty 10000 }
\def\subsubsecfonts{\subsecfonts} % Maybe this should change:
% Perhaps make sssec fonts scaled
% magstep half
-\def\subsubsecheading #1#2#3#4#5{{\advance \subsecheadingskip by \parskip %
+\def\subsubsecheading #1#2#3#4#5{\subsubsecheadingi {#2.#3.#4.#5\enspace #1}}
+\def\subsubsecheadingi #1{{\advance \subsecheadingskip by \parskip %
\subsecheadingbreak}%
-{\subsubsecfonts \line{\rm#2.#3.#4.#5\enspace #1\hfill}}%
+{\subsubsecfonts \vbox{\hyphenpenalty=10000\tolerance=5000
+ \parindent=0pt\raggedright
+ \rm #1\hfill}}%
\ifdim \parskip<10pt \kern 10pt\kern -\parskip\fi \penalty 10000}
@@ -1451,17 +1793,20 @@ July\or August\or September\or October\or November\or December\fi
% Finish up the main text and prepare to read what we've written
% to \contentsfile.
+\newskip\contentsrightmargin \contentsrightmargin=1in
\def\startcontents#1{%
\ifnum \pageno>0
\pagealignmacro
\immediate\closeout \contentsfile
\pageno = -1 % Request roman numbered pages.
\fi
- \unnumbchapmacro{#1}\def\thischapter{#1}%
+ % Don't need to put `Contents' or `Short Contents' in the headline.
+ % It is abundantly clear what they are.
+ \unnumbchapmacro{#1}\def\thischapter{}%
\begingroup % Set up to handle contents files properly.
\catcode`\\=0 \catcode`\{=1 \catcode`\}=2 \catcode`\@=11
\raggedbottom % Worry more about breakpoints than the bottom.
- \advance\hsize by -1in % Don't use the full line length.
+ \advance\hsize by -\contentsrightmargin % Don't use the full line length.
}
@@ -1480,7 +1825,9 @@ July\or August\or September\or October\or November\or December\fi
\let\chapentry = \shortchapentry
\let\unnumbchapentry = \shortunnumberedentry
% We want a true roman here for the page numbers.
- \secfonts \let\rm = \truesecrm \rm
+ \secfonts
+ \let\rm=\shortcontrm \let\bf=\shortcontbf \let\sl=\shortcontsl
+ \rm
\advance\baselineskip by 1pt % Open it up a little.
\def\secentry ##1##2##3##4{}
\def\unnumbsecentry ##1##2{}
@@ -1501,13 +1848,19 @@ July\or August\or September\or October\or November\or December\fi
% Chapter-level things, for both the long and short contents.
\def\chapentry#1#2#3{\dochapentry{#2\labelspace#1}{#3}}
+
+% See comments in \dochapentry re vbox and related settings
\def\shortchapentry#1#2#3{%
- \line{{#2\labelspace #1}\dotfill\doshortpageno{#3}}%
+ \vbox{\hyphenpenalty=10000\tolerance=5000
+ \parindent=0pt\strut\raggedright
+ {#2\labelspace #1}\dotfill\doshortpageno{#3}}%
}
\def\unnumbchapentry#1#2{\dochapentry{#1}{#2}}
\def\shortunnumberedentry#1#2{%
- \line{#1\dotfill\doshortpageno{#2}}%
+ \vbox{\hyphenpenalty=10000\tolerance=5000
+ \parindent=0pt\strut\raggedright
+ #1\dotfill\doshortpageno{#2}}%
}
% Sections.
@@ -1519,8 +1872,9 @@ July\or August\or September\or October\or November\or December\fi
\def\unnumbsubsecentry#1#2{\dosubsecentry{#1}{#2}}
% And subsubsections.
-\def\subsubsecentry#1#2#3#4#5#6{\dosubsubsecentry{#2.#3.#4.#5\labelspace#1}{#6}}
-\def\unnumbsubsecentry#1#2{\dosubsubsecentry{#1}{#2}}
+\def\subsubsecentry#1#2#3#4#5#6{%
+ \dosubsubsecentry{#2.#3.#4.#5\labelspace#1}{#6}}
+\def\unnumbsubsubsecentry#1#2{\dosubsubsecentry{#1}{#2}}
% This parameter controls the indentation of the various levels.
@@ -1533,20 +1887,38 @@ July\or August\or September\or October\or November\or December\fi
% if at all possible; hence the \penalty.
\def\dochapentry#1#2{%
\penalty-300 \vskip\baselineskip
- \line{\chapentryfonts #1\dotfill \dopageno{#2}}%
+ % This \vbox (and similar ones in dosecentry etc.) used to be a
+ % \line; changed to permit linebreaks for long headings. See
+ % comments above \majorheading. Here we also use \strut to
+ % keep the top end of the vbox from jamming up against the previous
+ % entry in the table of contents.
+ \vbox{\chapentryfonts
+ \hyphenpenalty=10000\tolerance=5000 % this line and next introduced
+ \parindent=0pt\strut\raggedright % with \line -> \vbox change
+ #1\dotfill
+ \dopageno{#2}}%
\nobreak\vskip .25\baselineskip
}
\def\dosecentry#1#2{%
- \line{\secentryfonts \hskip\tocindent #1\dotfill \dopageno{#2}}%
+ \vbox{\secentryfonts \leftskip=\tocindent
+ \hyphenpenalty=10000\tolerance=5000
+ \parindent=0pt\strut\raggedright #1\dotfill
+ \dopageno{#2}}%
}
\def\dosubsecentry#1#2{%
- \line{\subsecentryfonts \hskip2\tocindent #1\dotfill \dopageno{#2}}%
+ \vbox{\subsecentryfonts \leftskip=2\tocindent
+ \hyphenpenalty=10000\tolerance=5000
+ \parindent=0pt\strut\raggedright #1\dotfill
+ \dopageno{#2}}%
}
\def\dosubsubsecentry#1#2{%
- \line{\subsubsecentryfonts \hskip3\tocindent #1\dotfill \dopageno{#2}}%
+ \vbox{\subsubsecentryfonts \leftskip=3\tocindent
+ \hyphenpenalty=10000\tolerance=5000
+ \parindent=0pt\strut\raggedright #1\dotfill
+ \dopageno{#2}}%
}
% Space between chapter (or whatever) number and the title.
@@ -1594,14 +1966,15 @@ July\or August\or September\or October\or November\or December\fi
% \def\bull{\leavevmode\copy\bullbox}
% Adapted from the TeXbook's \boxit.
-\dimen0 = 3em % Width of the box.
+{\tentt \global\dimen0 = 3em}% Width of the box.
\dimen2 = .55pt % Thickness of rules
% The text. (`r' is open on the right, `e' somewhat less so on the left.)
\setbox0 = \hbox{\kern-.75pt \tensf error\kern-1.5pt}
\global\setbox\errorbox=\hbox to \dimen0{\hfil
- \vbox{\hsize = \dimen0 \advance\hsize by -5.8pt % Space to left+right.
- \advance\hsize by -2\dimen2 % Rules.
+ \hsize = \dimen0 \advance\hsize by -5.8pt % Space to left+right.
+ \advance\hsize by -2\dimen2 % Rules.
+ \vbox{
\hrule height\dimen2
\hbox{\vrule width\dimen2 \kern3pt % Space to left of text.
\vtop{\kern2.4pt \box0 \kern2.4pt}% Space above/below.
@@ -1621,6 +1994,7 @@ July\or August\or September\or October\or November\or December\fi
\catcode `\$=3 \catcode `\&=4 \catcode `\#=6
\catcode `\^=7 \catcode `\_=8 \catcode `\~=13 \let~=\tie
\catcode `\%=14
+\catcode 43=12
\catcode`\"=12
\catcode`\==12
\catcode`\|=12
@@ -1632,7 +2006,8 @@ July\or August\or September\or October\or November\or December\fi
\let\}=\ptexrbrace
\let\.=\ptexdot
\let\*=\ptexstar
-\def\@={@}%
+\let\dots=\ptexdots
+\def\@{@}%
\let\bullet=\ptexbullet
\let\b=\ptexb \let\c=\ptexc \let\i=\ptexi \let\t=\ptext \let\l=\ptexl
\let\L=\ptexL
@@ -1665,7 +2040,66 @@ July\or August\or September\or October\or November\or December\fi
\def\afterenvbreak{\endgraf \ifdim\lastskip<\aboveenvskipamount
\removelastskip \penalty-50 \vskip\aboveenvskipamount \fi}
-\def\lisp{\aboveenvbreak\begingroup\inENV %This group ends at the end of the @lisp body
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% \cartouche: draw rectangle w/rounded corners around argument
+\font\circle=lcircle10
+\newdimen\circthick
+\newdimen\cartouter\newdimen\cartinner
+\newskip\normbskip\newskip\normpskip\newskip\normlskip
+\circthick=\fontdimen8\circle
+%
+\def\ctl{{\circle\char'013\hskip -6pt}}% 6pt from pl file: 1/2charwidth
+\def\ctr{{\hskip 6pt\circle\char'010}}
+\def\cbl{{\circle\char'012\hskip -6pt}}
+\def\cbr{{\hskip 6pt\circle\char'011}}
+\def\carttop{\hbox to \cartouter{\hskip\lskip
+ \ctl\leaders\hrule height\circthick\hfil\ctr
+ \hskip\rskip}}
+\def\cartbot{\hbox to \cartouter{\hskip\lskip
+ \cbl\leaders\hrule height\circthick\hfil\cbr
+ \hskip\rskip}}
+%
+\newskip\lskip\newskip\rskip
+
+% @cartouche: open the frame.  The ambient \leftskip and \rightskip are
+% saved in \lskip and \rskip and then zeroed, so the margins are applied
+% outside the frame.  The normal \baselineskip, \parskip and \lineskip are
+% saved too, because they are set to 0pt while the frame pieces are
+% stacked and restored for the text inside.
+% Nesting opened here: group, \vbox, \hbox, \vbox, group.
+\long\def\cartouche{%
+\begingroup
+ \lskip=\leftskip \rskip=\rightskip
+ \leftskip=0pt\rightskip=0pt %we want these *outside*.
+ \cartinner=\hsize \advance\cartinner by-\lskip
+ \advance\cartinner by-\rskip
+ \cartouter=\hsize
+ \advance\cartouter by 18pt % allow for 3pt kerns on either
+% side, and for 6pt waste from
+% each corner char
+ \normbskip=\baselineskip \normpskip=\parskip \normlskip=\lineskip
+ \vbox\bgroup
+ \baselineskip=0pt\parskip=0pt\lineskip=0pt
+ \carttop
+ \hbox\bgroup
+ \hskip\lskip
+ \vrule\kern3pt
+ \vbox\bgroup
+ \hsize=\cartinner
+ \kern3pt
+ \begingroup
+ \baselineskip=\normbskip
+ \lineskip=\normlskip
+ \parskip=\normpskip
+ \vskip -\parskip
+% \Ecartouche is defined each time \cartouche runs.  It closes everything
+% opened above in reverse order, adding the right-hand rule and the
+% bottom edge (\cartbot) on the way out.
+\def\Ecartouche{%
+ \endgroup
+ \kern3pt
+ \egroup
+ \kern3pt\vrule
+ \hskip\rskip
+ \egroup
+ \cartbot
+ \egroup
+\endgroup
+}}
+
+\def\lisp{\aboveenvbreak
+\begingroup\inENV % This group ends at the end of the @lisp body
\hfuzz=12truept % Don't be fussy
% Make spaces be word-separators rather than space tokens.
\sepspaces %
@@ -1708,7 +2142,7 @@ July\or August\or September\or October\or November\or December\fi
\advance \leftskip by \lispnarrowing
\parindent=0pt
\let\exdent=\internalexdent
-\obeyspaces \obeylines \ninett \rawbackslash
+\obeyspaces \obeylines \ninett \indexfonts \rawbackslash
\def\next##1{}\next}
% This is @display; same as @lisp except use roman font.
@@ -1747,7 +2181,8 @@ July\or August\or September\or October\or November\or December\fi
% @flushleft and @flushright
-\def\flushleft{\begingroup\inENV %This group ends at the end of the @format body
+\def\flushleft{%
+\begingroup\inENV %This group ends at the end of the @flushleft body
\aboveenvbreak
% Make spaces be word-separators rather than space tokens.
\sepspaces %
@@ -1761,7 +2196,8 @@ July\or August\or September\or October\or November\or December\fi
\obeyspaces \obeylines
\def\next##1{}\next}
-\def\flushright{\begingroup\inENV %This group ends at the end of the @format body
+\def\flushright{%
+\begingroup\inENV %This group ends at the end of the @flushright body
\aboveenvbreak
% Make spaces be word-separators rather than space tokens.
\sepspaces %
@@ -1778,7 +2214,8 @@ July\or August\or September\or October\or November\or December\fi
% @quotation - narrow the margins.
-\def\quotation{\begingroup\inENV %This group ends at the end of the @quotation body
+\def\quotation{%
+\begingroup\inENV %This group ends at the end of the @quotation body
{\parskip=0pt % because we will skip by \parskip too, later
\aboveenvbreak}%
\singlespace
@@ -1827,14 +2264,19 @@ July\or August\or September\or October\or November\or December\fi
%% These parens (in \boldbrax) actually are a little bolder than the
%% contained text. This is especially needed for [ and ]
\def\opnr{{\sf\char`\(}} \def\clnr{{\sf\char`\)}} \def\ampnr{\&}
-\def\lbrb{{\tt\char`\[}} \def\rbrb{{\tt\char`\]}}
+\def\lbrb{{\bf\char`\[}} \def\rbrb{{\bf\char`\]}}
% First, defname, which formats the header line itself.
% #1 should be the function name.
% #2 should be the type of definition, such as "Function".
\def\defname #1#2{%
-\leftskip = 0in %
+% Get the values of \leftskip and \rightskip as they were
+% outside the @def...
+\dimen2=\leftskip
+\advance\dimen2 by -\defbodyindent
+\dimen3=\rightskip
+\advance\dimen3 by -\defbodyindent
\noindent %
\setbox0=\hbox{\hskip \deflastargmargin{\rm #2}\hskip \deftypemargin}%
\dimen0=\hsize \advance \dimen0 by -\wd0 % compute size for first line
@@ -1843,8 +2285,13 @@ July\or August\or September\or October\or November\or December\fi
% Now output arg 2 ("Function" or some such)
% ending at \deftypemargin from the right margin,
% but stuck inside a box of width 0 so it does not interfere with linebreaking
-\rlap{\rightline{{\rm #2}\hskip \deftypemargin}}%
-\tolerance=10000 \hbadness=10000 % Make all lines underfull and no complaints
+{% Adjust \hsize to exclude the ambient margins,
+% so that \rightline will obey them.
+\advance \hsize by -\dimen2 \advance \hsize by -\dimen3
+\rlap{\rightline{{\rm #2}\hskip \deftypemargin}}}%
+% Make all lines underfull and no complaints:
+\tolerance=10000 \hbadness=10000
+\advance\leftskip by -\defbodyindent
{\df #1}\enskip % Generate function name
}
@@ -1860,8 +2307,11 @@ July\or August\or September\or October\or November\or December\fi
% so that it will exit this group.
\def#1{\endgraf\endgroup\medbreak}%
\def#2{\begingroup\obeylines\activeparens\spacesplit#3}%
-\parindent=0in \leftskip=\defbodyindent \rightskip=\defbodyindent %
-\begingroup\obeylines\activeparens\spacesplit#3}
+\parindent=0in
+\advance\leftskip by \defbodyindent \advance \rightskip by \defbodyindent
+\begingroup %
+\catcode 61=\active %
+\obeylines\activeparens\spacesplit#3}
\def\defmethparsebody #1#2#3#4 {\begingroup\inENV %
\medbreak %
@@ -1869,7 +2319,8 @@ July\or August\or September\or October\or November\or December\fi
% so that it will exit this group.
\def#1{\endgraf\endgroup\medbreak}%
\def#2##1 {\begingroup\obeylines\activeparens\spacesplit{#3{##1}}}%
-\parindent=0in \leftskip=\defbodyindent \rightskip=\defbodyindent %
+\parindent=0in
+\advance\leftskip by \defbodyindent \advance \rightskip by \defbodyindent
\begingroup\obeylines\activeparens\spacesplit{#3{#4}}}
\def\defopparsebody #1#2#3#4#5 {\begingroup\inENV %
@@ -1879,9 +2330,47 @@ July\or August\or September\or October\or November\or December\fi
\def#1{\endgraf\endgroup\medbreak}%
\def#2##1 ##2 {\def#4{##1}%
\begingroup\obeylines\activeparens\spacesplit{#3{##2}}}%
-\parindent=0in \leftskip=\defbodyindent %
+\parindent=0in
+\advance\leftskip by \defbodyindent \advance \rightskip by \defbodyindent
\begingroup\obeylines\activeparens\spacesplit{#3{#5}}}
+% These parsing functions are similar to the preceding ones
+% except that they do not make parens into active characters.
+% These are used for "variables" since they have no arguments.
+
+\def\defvarparsebody #1#2#3{\begingroup\inENV% Environment for definitionbody
+\medbreak %
+% Define the end token that this defining construct specifies
+% so that it will exit this group.
+\def#1{\endgraf\endgroup\medbreak}%
+\def#2{\begingroup\obeylines\spacesplit#3}%
+\parindent=0in
+\advance\leftskip by \defbodyindent \advance \rightskip by \defbodyindent
+\begingroup %
+\catcode 61=\active %
+\obeylines\spacesplit#3}
+
+\def\defvrparsebody #1#2#3#4 {\begingroup\inENV %
+\medbreak %
+% Define the end token that this defining construct specifies
+% so that it will exit this group.
+\def#1{\endgraf\endgroup\medbreak}%
+\def#2##1 {\begingroup\obeylines\spacesplit{#3{##1}}}%
+\parindent=0in
+\advance\leftskip by \defbodyindent \advance \rightskip by \defbodyindent
+\begingroup\obeylines\spacesplit{#3{#4}}}
+
+\def\defopvarparsebody #1#2#3#4#5 {\begingroup\inENV %
+\medbreak %
+% Define the end token that this defining construct specifies
+% so that it will exit this group.
+\def#1{\endgraf\endgroup\medbreak}%
+\def#2##1 ##2 {\def#4{##1}%
+\begingroup\obeylines\spacesplit{#3{##2}}}%
+\parindent=0in
+\advance\leftskip by \defbodyindent \advance \rightskip by \defbodyindent
+\begingroup\obeylines\spacesplit{#3{#5}}}
+
% Split up #2 at the first space token.
% call #1 with two arguments:
% the first is all of #2 before the space token,
@@ -1905,12 +2394,24 @@ July\or August\or September\or October\or November\or December\fi
\def\defunargs #1{\functionparens \sl
% Expand, preventing hyphenation at `-' chars.
% Note that groups don't affect changes in \hyphenchar.
-\hyphenchar\sl=0
+\hyphenchar\tensl=0
#1%
-\hyphenchar\sl=45
+\hyphenchar\tensl=45
\ifnum\parencount=0 \else \errmessage{unbalanced parens in @def arguments}\fi%
\interlinepenalty=10000
-\endgraf\penalty10000\vskip -\parskip }
+\advance\rightskip by 0pt plus 1fil
+\endgraf\penalty 10000\vskip -\parskip\penalty 10000%
+}
+
+\def\deftypefunargs #1{%
+% Expand, preventing hyphenation at `-' chars.
+% Note that groups don't affect changes in \hyphenchar.
+\functionparens
+\code{#1}%
+\interlinepenalty=10000
+\advance\rightskip by 0pt plus 1fil
+\endgraf\penalty 10000\vskip -\parskip\penalty 10000%
+}
% Do complete processing of one @defun or @defunx line already parsed.
@@ -1919,7 +2420,9 @@ July\or August\or September\or October\or November\or December\fi
\def\deffn{\defmethparsebody\Edeffn\deffnx\deffnheader}
\def\deffnheader #1#2#3{\doind {fn}{\code{#2}}%
-\begingroup\defname {#2}{#1}\defunargs{#3}\endgroup}
+\begingroup\defname {#2}{#1}\defunargs{#3}\endgroup %
+\catcode 61=\other % Turn off change made in \defparsebody
+}
% @defun == @deffn Function
@@ -1928,6 +2431,35 @@ July\or August\or September\or October\or November\or December\fi
\def\defunheader #1#2{\doind {fn}{\code{#1}}% Make entry in function index
\begingroup\defname {#1}{Function}%
\defunargs {#2}\endgroup %
+\catcode 61=\other % Turn off change made in \defparsebody
+}
+
+% @deftypefun int foobar (int @var{foo}, float @var{bar})
+
+\def\deftypefun{\defparsebody\Edeftypefun\deftypefunx\deftypefunheader}
+
+% #1 is the data type. #2 is the name and args.
+\def\deftypefunheader #1#2{\deftypefunheaderx{#1}#2 \relax}
+% #1 is the data type, #2 the name, #3 the args.
+\def\deftypefunheaderx #1#2 #3\relax{%
+\doind {fn}{\code{#2}}% Make entry in function index
+\begingroup\defname {\code{#1} #2}{Function}%
+\deftypefunargs {#3}\endgroup %
+\catcode 61=\other % Turn off change made in \defparsebody
+}
+
+% @deftypefn {Library Function} int foobar (int @var{foo}, float @var{bar})
+
+\def\deftypefn{\defmethparsebody\Edeftypefn\deftypefnx\deftypefnheader}
+
+% #1 is the classification. #2 is the data type. #3 is the name and args.
+\def\deftypefnheader #1#2#3{\deftypefnheaderx{#1}{#2}#3 \relax}
+% #1 is the classification, #2 the data type, #3 the name, #4 the args.
+\def\deftypefnheaderx #1#2#3 #4\relax{%
+\doind {fn}{\code{#3}}% Make entry in function index
+\begingroup\defname {\code{#2} #3}{#1}%
+\deftypefunargs {#4}\endgroup %
+\catcode 61=\other % Turn off change made in \defparsebody
}
% @defmac == @deffn Macro
@@ -1937,6 +2469,7 @@ July\or August\or September\or October\or November\or December\fi
\def\defmacheader #1#2{\doind {fn}{\code{#1}}% Make entry in function index
\begingroup\defname {#1}{Macro}%
\defunargs {#2}\endgroup %
+\catcode 61=\other % Turn off change made in \defparsebody
}
% @defspec == @deffn Special Form
@@ -1944,8 +2477,9 @@ July\or August\or September\or October\or November\or December\fi
\def\defspec{\defparsebody\Edefspec\defspecx\defspecheader}
\def\defspecheader #1#2{\doind {fn}{\code{#1}}% Make entry in function index
-\begingroup\defname {#1}{Special form}%
+\begingroup\defname {#1}{Special Form}%
\defunargs {#2}\endgroup %
+\catcode 61=\other % Turn off change made in \defparsebody
}
% This definition is run if you use @defunx
@@ -1955,6 +2489,8 @@ July\or August\or September\or October\or November\or December\fi
\def\defunx #1 {\errmessage{@defunx in invalid context}}
\def\defmacx #1 {\errmessage{@defmacx in invalid context}}
\def\defspecx #1 {\errmessage{@defspecx in invalid context}}
+\def\deftypefnx #1 {\errmessage{@deftypefnx in invalid context}}
+\def\deftypefunx #1 {\errmessage{@deftypefunx in invalid context}}
% @defmethod, and so on
@@ -1963,7 +2499,8 @@ July\or August\or September\or October\or November\or December\fi
\def\defop #1 {\def\defoptype{#1}%
\defopparsebody\Edefop\defopx\defopheader\defoptype}
-\def\defopheader #1#2#3{\dosubind {fn}{\code{#2}}{on #1}% Make entry in function index
+\def\defopheader #1#2#3{%
+\dosubind {fn}{\code{#2}}{on #1}% Make entry in function index
\begingroup\defname {#2}{\defoptype{} on #1}%
\defunargs {#3}\endgroup %
}
@@ -1972,29 +2509,30 @@ July\or August\or September\or October\or November\or December\fi
\def\defmethod{\defmethparsebody\Edefmethod\defmethodx\defmethodheader}
-\def\defmethodheader #1#2#3{\dosubind {fn}{\code{#2}}{on #1}% entry in function index
-\begingroup\defname {#2}{Operation on #1}%
+\def\defmethodheader #1#2#3{%
+\dosubind {fn}{\code{#2}}{on #1}% entry in function index
+\begingroup\defname {#2}{Method on #1}%
\defunargs {#3}\endgroup %
}
% @defcv {Class Option} foo-class foo-flag
\def\defcv #1 {\def\defcvtype{#1}%
-\defopparsebody\Edefcv\defcvx\defcvheader\defcvtype}
+\defopvarparsebody\Edefcv\defcvx\defcvarheader\defcvtype}
\def\defcvarheader #1#2#3{%
\dosubind {vr}{\code{#2}}{of #1}% Make entry in var index
-\begingroup\defname {#2}{\defcvtype of #1}%
+\begingroup\defname {#2}{\defcvtype{} of #1}%
\defvarargs {#3}\endgroup %
}
% @defivar == @defcv {Instance Variable}
-\def\defivar{\defmethparsebody\Edefivar\defivarx\defivarheader}
+\def\defivar{\defvrparsebody\Edefivar\defivarx\defivarheader}
\def\defivarheader #1#2#3{%
\dosubind {vr}{\code{#2}}{of #1}% Make entry in var index
-\begingroup\defname {#2}{Instance variable of #1}%
+\begingroup\defname {#2}{Instance Variable of #1}%
\defvarargs {#3}\endgroup %
}
@@ -2013,18 +2551,18 @@ July\or August\or September\or October\or November\or December\fi
% This must expand the args and terminate the paragraph they make up
\def\defvarargs #1{\normalparens #1%
\interlinepenalty=10000
-\endgraf\penalty 10000\vskip -\parskip}
+\endgraf\penalty 10000\vskip -\parskip\penalty 10000}
% @defvr Counter foo-count
-\def\defvr{\defmethparsebody\Edefvr\defvrx\defvrheader}
+\def\defvr{\defvrparsebody\Edefvr\defvrx\defvrheader}
\def\defvrheader #1#2#3{\doind {vr}{\code{#2}}%
\begingroup\defname {#2}{#1}\defvarargs{#3}\endgroup}
% @defvar == @defvr Variable
-\def\defvar{\defparsebody\Edefvar\defvarx\defvarheader}
+\def\defvar{\defvarparsebody\Edefvar\defvarx\defvarheader}
\def\defvarheader #1#2{\doind {vr}{\code{#1}}% Make entry in var index
\begingroup\defname {#1}{Variable}%
@@ -2033,19 +2571,43 @@ July\or August\or September\or October\or November\or December\fi
% @defopt == @defvr {User Option}
-\def\defopt{\defparsebody\Edefopt\defoptx\defoptheader}
+\def\defopt{\defvarparsebody\Edefopt\defoptx\defoptheader}
\def\defoptheader #1#2{\doind {vr}{\code{#1}}% Make entry in var index
\begingroup\defname {#1}{User Option}%
\defvarargs {#2}\endgroup %
}
+% @deftypevar int foobar
+
+\def\deftypevar{\defvarparsebody\Edeftypevar\deftypevarx\deftypevarheader}
+
+% #1 is the data type. #2 is the name.
+\def\deftypevarheader #1#2{%
+\doind {vr}{\code{#2}}% Make entry in variables index
+\begingroup\defname {\code{#1} #2}{Variable}%
+\interlinepenalty=10000
+\endgraf\penalty 10000\vskip -\parskip\penalty 10000
+\endgroup}
+
+% @deftypevr {Global Flag} int enable
+
+\def\deftypevr{\defvrparsebody\Edeftypevr\deftypevrx\deftypevrheader}
+
+\def\deftypevrheader #1#2#3{\doind {vr}{\code{#3}}%
+\begingroup\defname {\code{#2} #3}{#1}
+\interlinepenalty=10000
+\endgraf\penalty 10000\vskip -\parskip\penalty 10000
+\endgroup}
+
% This definition is run if you use @defvarx
% anywhere other than immediately after a @defvar or @defvarx.
\def\defvrx #1 {\errmessage{@defvrx in invalid context}}
\def\defvarx #1 {\errmessage{@defvarx in invalid context}}
\def\defoptx #1 {\errmessage{@defoptx in invalid context}}
+\def\deftypevarx #1 {\errmessage{@deftypevarx in invalid context}}
+\def\deftypevrx #1 {\errmessage{@deftypevrx in invalid context}}
% Now define @deftp
% Args are printed in bold, a slight difference from @defvar.
@@ -2054,7 +2616,7 @@ July\or August\or September\or October\or November\or December\fi
% @deftp Class window height width ...
-\def\deftp{\defmethparsebody\Edeftp\deftpx\deftpheader}
+\def\deftp{\defvrparsebody\Edeftp\deftpx\deftpheader}
\def\deftpheader #1#2#3{\doind {tp}{\code{#2}}%
\begingroup\defname {#2}{#1}\deftpargs{#3}\endgroup}
@@ -2068,42 +2630,74 @@ July\or August\or September\or October\or November\or December\fi
% Define cross-reference macros
\newwrite \auxfile
+\newif\ifhavexrefs % True if xref values are known.
+\newif\ifwarnedxrefs % True if we warned once that they aren't known.
+
% \setref{foo} defines a cross-reference point named foo.
\def\setref#1{%
+%\dosetq{#1-title}{Ytitle}%
\dosetq{#1-pg}{Ypagenumber}%
\dosetq{#1-snt}{Ysectionnumberandtype}}
\def\unnumbsetref#1{%
+%\dosetq{#1-title}{Ytitle}%
\dosetq{#1-pg}{Ypagenumber}%
\dosetq{#1-snt}{Ynothing}}
\def\appendixsetref#1{%
+%\dosetq{#1-title}{Ytitle}%
\dosetq{#1-pg}{Ypagenumber}%
\dosetq{#1-snt}{Yappendixletterandtype}}
-% \xref and \pxref generate cross references to specified points.
-
-\def\pxref #1{see \xrefX [#1,,,,,,,]}
-\def\xref #1{See \xrefX [#1,,,,,,,]}
-\def\ref #1{\xrefX [#1,,,,,,,]}
-\def\xrefX [#1,#2,#3,#4,#5,#6]{%
-\setbox1=\hbox{\i{\losespace#5{}}}%
-\setbox0=\hbox{\losespace#3{}}%
-\ifdim \wd0 =0pt \setbox0=\hbox{\losespace#1{}}\fi%
-\ifdim \wd1 >0pt%
-section `\unhbox0' in \unhbox1%
+% \xref, \pxref, and \ref generate cross-references to specified points.
+% For \xrefX, #1 is the node name, #2 the name of the Info
+% cross-reference, #3 the printed node name, #4 the name of the Info
+% file, #5 the name of the printed manual. All but the node name can be
+% omitted.
+%
+\def\pxref#1{see \xrefX[#1,,,,,,,]}
+\def\xref#1{See \xrefX[#1,,,,,,,]}
+\def\ref#1{\xrefX[#1,,,,,,,]}
+\def\xrefX[#1,#2,#3,#4,#5,#6]{\begingroup%
+\def\printedmanual{\ignorespaces #5}%
+\def\printednodename{\ignorespaces #3}%
+%
+\setbox1=\hbox{\printedmanual}%
+\setbox0=\hbox{\printednodename}%
+\ifdim \wd0=0pt%
+\def\printednodename{\ignorespaces #1}%
+%%% Uncomment the following line to make the actual chapter or section title
+%%% appear inside the square brackets.
+%\def\printednodename{#1-title}%
+\fi%
+%
+%
+% If we use \unhbox0 and \unhbox1 to print the node names, TeX does
+% not insert empty discretionaries after hyphens, which means that it
+% will not find a line break at a hyphen in a node name. Since some
+% manuals are best written with fairly long node names, containing
+% hyphens, this is a loss. Therefore, we simply give the text of
+% the node name again, so it is as if TeX is seeing it for the first
+% time.
+\ifdim \wd1>0pt
+section ``\printednodename'' in \cite{\printedmanual}%
\else%
-\refx{#1-snt}{} [\unhbox0], page\tie \refx{#1-pg}{}%
-\fi }
+\turnoffactive%
+\refx{#1-snt}{} [\printednodename], page\tie\refx{#1-pg}{}%
+\fi
+\endgroup}
% \dosetq is the interface for calls from other macros
-\def\dosetq #1#2{{\let\folio=0%
+% Use \turnoffactive so that punctuation chars such as underscore
+% work in node names.
+\def\dosetq #1#2{{\let\folio=0 \turnoffactive%
\edef\next{\write\auxfile{\internalsetq {#1}{#2}}}%
\next}}
-% \internalsetq {foo}{page} expands into CHARACTERS 'xrdef {foo}{...expansion of \Ypage...}
+% \internalsetq {foo}{page} expands into
+% CHARACTERS 'xrdef {foo}{...expansion of \Ypage...}
% When the aux file is read, ' is the escape character
\def\internalsetq #1#2{'xrdef {#1}{\csname #2\endcsname}}
@@ -2112,42 +2706,60 @@ section `\unhbox0' in \unhbox1%
\def\Ypagenumber{\folio}
+\def\Ytitle{\thischapter}
+
\def\Ynothing{}
\def\Ysectionnumberandtype{%
-\ifnum\secno=0 chapter\xreftie\the\chapno %
-\else \ifnum \subsecno=0 section\xreftie\the\chapno.\the\secno %
+\ifnum\secno=0 Chapter\xreftie\the\chapno %
+\else \ifnum \subsecno=0 Section\xreftie\the\chapno.\the\secno %
\else \ifnum \subsubsecno=0 %
-section\xreftie\the\chapno.\the\secno.\the\subsecno %
+Section\xreftie\the\chapno.\the\secno.\the\subsecno %
\else %
-section\xreftie\the\chapno.\the\secno.\the\subsecno.\the\subsubsecno %
+Section\xreftie\the\chapno.\the\secno.\the\subsecno.\the\subsubsecno %
\fi \fi \fi }
\def\Yappendixletterandtype{%
-\ifnum\secno=0 appendix\xreftie'char\the\appendixno %
-\else \ifnum \subsecno=0 section\xreftie'char\the\appendixno.\the\secno %
+\ifnum\secno=0 Appendix\xreftie'char\the\appendixno{}%
+\else \ifnum \subsecno=0 Section\xreftie'char\the\appendixno.\the\secno %
\else \ifnum \subsubsecno=0 %
-section\xreftie'char\the\appendixno.\the\secno.\the\subsecno %
+Section\xreftie'char\the\appendixno.\the\secno.\the\subsecno %
\else %
-section\xreftie'char\the\appendixno.\the\secno.\the\subsecno.\the\subsubsecno %
+Section\xreftie'char\the\appendixno.\the\secno.\the\subsecno.\the\subsubsecno %
\fi \fi \fi }
\gdef\xreftie{'tie}
+% Use TeX 3.0's \inputlineno to get the line number, for better error
+% messages, but if we're using an old version of TeX, don't do anything.
+%
+\ifx\inputlineno\thisisundefined
+ \let\linenumber = \empty % Non-3.0.
+\else
+ \def\linenumber{\the\inputlineno:\space}
+\fi
+
% Define \refx{NAME}{SUFFIX} to reference a cross-reference string named NAME.
% If its value is nonempty, SUFFIX is output afterward.
\def\refx#1#2{%
-{%
-\expandafter\ifx\csname X#1\endcsname\relax
-% If not defined, say something at least.
-\expandafter\gdef\csname X#1\endcsname {$\langle$un\-def\-in\-ed$\rangle$}#2%
-\message {WARNING: Cross-reference "#1" used but not yet defined}%
-\message {}%
-\fi %
-\setbox0=\hbox{\csname X#1\endcsname}%It's defined, so just use it.
-\ifdim\wd0>0pt \unhbox0{}#2\fi
-}}
+ \expandafter\ifx\csname X#1\endcsname\relax
+ % If not defined, say something at least.
+ $\langle$un\-de\-fined$\rangle$%
+ \ifhavexrefs
+ \message{\linenumber Undefined cross reference `#1'.}%
+ \else
+ \ifwarnedxrefs\else
+ \global\warnedxrefstrue
+ \message{Cross reference values unknown; you must run TeX again.}%
+ \fi
+ \fi
+ \else
+ % It's defined, so just use it.
+ \csname X#1\endcsname
+ \fi
+ #2% Output the suffix in any case.
+}
% Read the last existing aux file, if any. No error if none exists.
@@ -2210,7 +2822,7 @@ section\xreftie'char\the\appendixno.\the\secno.\the\subsecno.\the\subsubsecno %
\catcode `\'=0
\catcode `\\=\other
\openin 1 \jobname.aux
-\ifeof 1 \else \closein 1 \input \jobname.aux
+\ifeof 1 \else \closein 1 \input \jobname.aux \global\havexrefstrue
\fi
% Open the new aux file. Tex will close it automatically at exit.
\openout \auxfile=\jobname.aux
@@ -2223,21 +2835,26 @@ section\xreftie'char\the\appendixno.\the\secno.\the\subsecno.\the\subsubsecno %
\def\supereject{\par\penalty -20000\footnoteno =0 }
+% @footnotestyle is meaningful for info output only.
+\let\footnotestyle=\comment
+
\let\ptexfootnote=\footnote
{\catcode `\@=11
-\gdef\footnote{\global\advance \footnoteno by \@ne
+\long\gdef\footnote #1{\global\advance \footnoteno by \@ne
+\unskip
\edef\thisfootno{$^{\the\footnoteno}$}%
\let\@sf\empty
\ifhmode\edef\@sf{\spacefactor\the\spacefactor}\/\fi
-\thisfootno\@sf\parsearg\footnotezzz}
+\thisfootno\@sf \footnotezzz{#1}}
+% \parsearg\footnotezzz}
-\gdef\footnotezzz #1{\insert\footins{
+\long\gdef\footnotezzz #1{\insert\footins{
\interlinepenalty\interfootnotelinepenalty
\splittopskip\ht\strutbox % top baseline for broken footnotes
\splitmaxdepth\dp\strutbox \floatingpenalty\@MM
\leftskip\z@skip \rightskip\z@skip \spaceskip\z@skip \xspaceskip\z@skip
-\footstrut\hang\textindent{\thisfootno}#1\strut}}
+\footstrut\parindent=\defaultparindent\hang\textindent{\thisfootno}#1\strut}}
}%end \catcode `\@=11
@@ -2256,8 +2873,9 @@ section\xreftie'char\the\appendixno.\the\secno.\the\subsecno.\the\subsubsecno %
% Set some numeric style parameters, for 8.5 x 11 format.
-\hsize = 6.5in
-\parindent 15pt
+%\hsize = 6.5in
+\newdimen\defaultparindent \defaultparindent = 15pt
+\parindent = \defaultparindent
\parskip 18pt plus 1pt
\baselineskip 15pt
\advance\topskip by 1.2cm
@@ -2265,6 +2883,16 @@ section\xreftie'char\the\appendixno.\the\secno.\the\subsecno.\the\subsubsecno %
% Prevent underfull vbox error messages.
\vbadness=10000
+% Use TeX 3.0's \emergencystretch to help line breaking, but if we're
+% using an old version of TeX, don't do anything. We want the amount of
+% stretch added to depend on the line length, hence the dependence on
+% \hsize. This makes it come to about 9pt for the 8.5x11 format.
+%
+\ifx\emergencystretch\thisisundefined \else
+ \emergencystretch = \hsize
+ \divide\emergencystretch by 45
+\fi
+
% Use @smallbook to reset parameters for 7x9.5 format
\def\smallbook{
\global\lispnarrowing = 0.3in
@@ -2275,24 +2903,67 @@ section\xreftie'char\the\appendixno.\the\secno.\the\subsecno.\the\subsubsecno %
\global\vsize=7.5in
\global\tolerance=700
\global\hfuzz=1pt
+\global\contentsrightmargin=0pt
\global\pagewidth=\hsize
\global\pageheight=\vsize
-\global\font\ninett=cmtt9
\global\let\smalllisp=\smalllispx
\global\let\smallexample=\smalllispx
\global\def\Esmallexample{\Esmalllisp}
}
+% Use @afourpaper to print on European A4 paper.
+\def\afourpaper{
+\global\tolerance=700
+\global\hfuzz=1pt
+
+\global\vsize= 53\baselineskip
+\advance\vsize by \topskip
+\global\hsize= 5.85in % A4 wide 10pt
+
+\global\pagewidth=\hsize
+\global\pageheight=\vsize
+}
+
%% For a final copy, take out the rectangles
%% that mark overfull boxes (in case you have decided
%% that the text looks ok even though it passes the margin).
\def\finalout{\overfullrule=0pt}
+% Define macros to output various characters with catcode for normal text.
+\catcode`\"=\other
+\catcode`\~=\other
+\catcode`\^=\other
+\catcode`\_=\other
+\catcode`\|=\other
+\catcode`\<=\other
+\catcode`\>=\other
+\catcode`\+=\other
+\def\normaldoublequote{"}
+\def\normaltilde{~}
+\def\normalcaret{^}
+\def\normalunderscore{_}
+\def\normalverticalbar{|}
+\def\normalless{<}
+\def\normalgreater{>}
+\def\normalplus{+}
+
+% This macro is used to make a character print one way in ttfont
+% where it can probably just be output, and another way in other fonts,
+% where something hairier probably needs to be done.
+%
+% #1 is what to print if we are indeed using \tt; #2 is what to print
+% otherwise. Since all the Computer Modern typewriter fonts have zero
+% interword stretch (and shrink), and it is reasonable to expect all
+% typewriter fonts to have this, we can check that font parameter.
+%
+\def\ifusingtt#1#2{\ifdim \fontdimen3\the\font=0pt #1\else #2\fi}
+
% Turn off all special characters except @
-% (and those which the user can use as if they were ordinary)
-% Define certain chars to be always in tt font.
+% (and those which the user can use as if they were ordinary).
+% Most of these we simply print from the \tt font, but for some, we can
+% use math or other variants that look better in normal text.
\catcode`\"=\active
\def\activedoublequote{{\tt \char '042}}
@@ -2302,8 +2973,19 @@ section\xreftie'char\the\appendixno.\the\secno.\the\subsecno.\the\subsubsecno %
\chardef\hat=`\^
\catcode`\^=\active
\def^{{\tt \hat}}
+
\catcode`\_=\active
-\def_{{\tt \char '137}}
+\def_{\ifusingtt\normalunderscore\_}
+% Subroutine for the previous macro.
+\def\_{\lvvmode \kern.06em \vbox{\hrule width.3em height.1ex}}
+
+% \lvvmode is equivalent in function to \leavevmode.
+% Using \leavevmode runs into trouble when written out to
+% an index file due to the expansion of \leavevmode into ``\unhbox
+% \voidb@x'' ---which looks to TeX like ``\unhbox \voidb\x'' due to our
+% magic tricks with @.
+\def\lvvmode{\vbox to 0pt{}}
+
\catcode`\|=\active
\def|{{\tt \char '174}}
\chardef \less=`\<
@@ -2317,11 +2999,27 @@ section\xreftie'char\the\appendixno.\the\secno.\the\subsecno.\the\subsubsecno %
%\catcode 27=\active
%\def^^[{$\diamondsuit$}
+% Used sometimes to turn off (effectively) the active characters
+% even after parsing them.
+\def\turnoffactive{\let"=\normaldoublequote
+\let~=\normaltilde
+\let^=\normalcaret
+\let_=\normalunderscore
+\let|=\normalverticalbar
+\let<=\normalless
+\let>=\normalgreater
+\let+=\normalplus}
+
+% Set up an active definition for =, but don't enable it most of the time.
+{\catcode`\==\active
+\global\def={{\tt \char 61}}}
+
\catcode`\@=0
% \rawbackslashxx output one backslash character in current font
-{\catcode`\\=\other
-@gdef@rawbackslashxx{\}}
+\global\chardef\rawbackslashxx=`\\
+%{\catcode`\\=\other
+%@gdef@rawbackslashxx{\}}
% \rawbackslash redefines \ as input to do \rawbackslashxx.
{\catcode`\\=\active
@@ -2333,7 +3031,7 @@ section\xreftie'char\the\appendixno.\the\secno.\the\subsecno.\the\subsubsecno %
% Say @foo, not \foo, in error messages.
\escapechar=`\@
-@c \catcode 17=0 @c Define control-q
+% \catcode 17=0 % Define control-q
\catcode`\\=\active
% If a .fmt file is being used, we don't want the `\input texinfo' to show up.
@@ -2355,3 +3053,7 @@ section\xreftie'char\the\appendixno.\the\secno.\the\subsecno.\the\subsubsecno %
@textfonts
@rm
+
+@c Local variables:
+@c page-delimiter: "^\\\\message"
+@c End:
diff --git a/vms/gawk.hlp b/vms/gawk.hlp
index 68892393..660e0353 100644
--- a/vms/gawk.hlp
+++ b/vms/gawk.hlp
@@ -5,10 +5,10 @@
!
1 GAWK
GAWK is GNU awk, the Free Software Foundation's implementation of
- the awk programming language. awk is an interperative language which
+ the awk programming language. awk is an interpretive language which
can handle many data-reformatting jobs with just a few lines of code.
It has powerful string manipulation and pattern matching capabilities
- built in. This version should be compatable with POSIX 1003.2 awk.
+ built in. This version should be compatible with POSIX 1003.2 awk.
The VMS version of GAWK supports both the original UN*X-style command
interface and a DCL interface. The only setup requirement for GAWK
@@ -38,10 +38,10 @@
-v var=val assign a value of 'val' to the variable 'var'
-W 'options' additional gawk-specific options; multiple values may
be separated by commas, or by spaces if they're quoted,
- or mulitple occurences of -W may be used.
+ or multiple occurrences of -W may be used.
-W compat use awk "compatibility mode" to disable GAWK extensions
and get the behavior of UN*X awk.
- -W copyright [or -W copyleft] display an abbreivated version of
+ -W copyright [or -W copyleft] display an abbreviated version of
the GNU copyright information
-W lint warn about suspect or non-portable awk program code
-W posix compatibility mode with additional restrictions
@@ -54,13 +54,13 @@
$ gawk -- "BEGIN {print ""\nHello, World!\n""}"
This program would print a blank line (based on first "\n"), followed
by a line reading "Hello, World!", followed by another blank line
- (since awk's 'print' statement includes trailing 'newline').
+ (since awk's 'print' statement includes the trailing 'newline').
On VMS, to include a quote character inside of a quoted string, two
successive quotes ("") must be used.
3 data_files
After all dash-options are examined, and after the program text if
- there were no occurences of the -f option, remaining (space separated)
+ there were no occurrences of the -f option, remaining (space separated)
command line arguments are considered to be data files for the awk
program to process. If any of these actually contains an equals sign
(=), then it is interpreted as a variable assignment instead of a data
@@ -88,8 +88,8 @@
implied carriage control)
2>&1 route error messages into the regular output stream
1>&2 send output data to the error destination
- <<sentinal error; reading stdin until 'sentinal' not supported
- <-, >- error; closer of stdin or stdout from cmd line not supported
+ <<sentinel error; reading stdin until 'sentinel' not supported
+ <-, >- error; closure of stdin or stdout from cmd line not supported
>>$vfile incorrect; would be interpreted as file "$vfile" in stream-lf
format rather than as file "vfile" in RMS 'text' format
| error; command line pipes not supported
@@ -141,7 +141,7 @@
subtopic 'GAWK GNU_syntax wildcard_expansion' for details.
At least one data_file parameter value is required. An exception is
- made if /usage, /version, or /copyright is specifed *and* if GAWK is
+ made if /usage, /version, or /copyright is specified *and* if GAWK is
defined as a 'foreign' command rather than a 'native' DCL command.
3 Qualifiers
/COMMANDS
@@ -153,7 +153,7 @@
$ gawk/commands="BEGIN {print ""\nHello, World!\n""}" NL:
This program would print a blank line (based on first "\n"), followed
by a line reading "Hello, World!", followed by another blank line
- (since awk's 'print' statement includes trailing 'newline').
+ (since awk's 'print' statement includes the trailing 'newline').
To include a quote character inside of a quoted string, two
successive quotes ("") must be used.
@@ -181,15 +181,6 @@
/REG_EXPR
/REG_EXPR={AWK | EGREP | POSIX} (-a vs -e options [obsolete])
- Specify regular expression syntax.
-
- /REG_EXPR=AWK use the original awk syntax for regular expressions
- /REG_EXPR=EGREP use the egrep syntax for regular expressions
- /REG_EXPR=POSIX equivalent to /REG_EXPR=EGREP
-
- If /REG_EXTR is omitted, then /REG_EXPR=AWK is the default. However,
- if /REG_EXTR is included but its value is omitted, EGREP is used.
-
This qualifier is obsolete and has no effect.
/STRICT
/[NO]STRICT (-"W compat" option)
@@ -322,9 +313,10 @@
(and also of a comparison operation) will be 0 when false
or 1 when true
|| or [expression (a || b) is true if either a is true or b
- is true or both a and b are true; it is false otherwise]
+ is true or both a and b are true; it is false otherwise;
+ b is not evaluated unless a is false (ie, short-circuit)]
&& and [expression (a && b) is true if both a and b are true;
- it is false otherwise]
+ it is false otherwise; b is only evaluated if a is true]
! not [expression (!a) is true if a is false, false otherwise]
in array membership; the keyword 'in' tests whether the value
on the left represents a current subscript in the array
@@ -332,7 +324,7 @@
Conditional operator
? : the conditional operator takes three operands; the first is
an expression to evaluate, the second is the expression to
- use if the first was true, the third is the expession to
+ use if the first was true, the third is the expression to
use if it was false [simple example (a < b ? b : a) gives
the maximum of a and b]
Assignment operators
@@ -357,7 +349,7 @@
there is no explicit operator for conversion; adding 0
to a string with force it to be converted to a number
(the numeric value will be 0 if the string does not
- represent a decimal or floating point number); the
+ represent an integer or floating point number); the
reverse, converting a number into a string, is done by
concatenating a null string ("") to it [the expression
(5.75 "") evaluates to "5.75"]
@@ -378,7 +370,7 @@
awk rule]
Escape 'operator'
\ In quoted character strings, the backslash (\) character
- causes the following character to be intrepreted in a
+ causes the following character to be interpreted in a
special manner [string "one\ntwo" has an embedded newline
character (linefeed on VMS, but treated as if it were both
carriage-return and linefeed); string "\033[" has an ASCII
@@ -402,7 +394,7 @@
unary plus (+), unary minus (-), boolean not (!)
multiplication (*), division (/), remainder (%)
addition (+), subtraction (-)
- concatentation (no special symbol; implied by context)
+ concatenation (no special symbol; implied by context)
relational (==, !=, <, >=, etc), and redirection (<, >, >>, |)
Relational and redirection operators have the same precedence
and use similar symbols; context distinguishes between them
@@ -413,9 +405,9 @@
conditional (? :)
assignment (=, +=, etc)
4 escaped_characters
- Inside of a quoted string, the backslash (\) character gives special
- meaning the the character(s) after it. Special character letters
- are case sensitive.
+ Inside of a quoted string or constant regular expression, the
+ backslash (\) character gives special meaning to the character(s)
+ after it. Special character letters are case sensitive.
\\ results in one backslash in the string
\a is an 'alert' (<ctrl/G>. the ASCII <bell> character)
\b is a backspace (BS, <ctrl/H>)
@@ -428,12 +420,12 @@
\### is an arbitrary character, where '###' represents 1 to 3
octal (ie, 0 thru 7) digits
\x## is an alternate arbitrary character, where '##' represents
- 1 or more hexadecimal (ie, 0 thru 9 and/or A thru E and/or
- a thru e) digits; if more than two digits follow, the
- result is undefined; not recognized if POSIX compatibility
- mode is specified.
+ 1 or more hexadecimal (ie, 0 thru 9 and/or A through F
+ and/or a through f) digits; if more than two digits
+ follow, the result is undefined; not recognized if POSIX
+ compatibility mode is specified.
3 statements
- A statement refers to a unit of intruction found in the action
+ A statement refers to a unit of instruction found in the action
part of an awk rule, and also found in the definition of a function.
The distinction between action, statement, and expression usually
won't matter to an awk programmer.
@@ -512,7 +504,8 @@
5 while_example
# strip fields from the input record until there's nothing left
while (NF > 0) {
- $1 = "" #this causes $0 to be reconstructed
+ $1 = "" #this will affect the value of $0
+ $0 = $0 #this causes $0 and NF to be re-evaluated
print
}
5 do_while_example
@@ -521,11 +514,12 @@
# echo input record until all fields have been stripped
do {
print #output $0
- $1 = "" #this causes $0 to be reconstructed
+ $1 = "" #this will affect the value of $0
+ $0 = $0 #this causes $0 and NF to be re-evaluated
} while (NF > 0)
5 for_example
- # print the ASCII alphabet (in lowercase)
- for ( letter = 'a'; letter <= 'z'; letter++ ) print letter
+ # echo command line arguments (won't include option switches)
+ for ( i = 0; i < ARGC; i++ ) print ARGV[i]
# display contents of builtin environment array
for (itm in ENVIRON)
@@ -594,11 +588,9 @@
3 fields, the value of $5 would be "").
Assigning a new value to $0 causes all the other field values (and NF)
- to be re-evaluated. Changing a specific field, causes $0 to receive
- a new value, but the other existing fields remain unchanged.
-
- For efficiency, gawk only performs field splitting at the first time
- a specific field (or NF) is actually needed.
+ to be re-evaluated. Changing a specific field will cause $0 to receive
+ a new value once it's re-evaluated, but until then the other existing
+ fields remain unchanged.
3 variables
Variables in awk can hold both numeric and string values and do not
have to be pre-declared. In fact, there is no way to explicitly
@@ -720,13 +712,12 @@
on them, and returns a single result.
The syntax for calling a function consists of the function name
- immediately followed by an open paren (left parenthesis '('),
- optionally followed by white space (spaces and/or tabs), followed
- by an appropriate argument value (number, string, variable, array
- reference, or expression involving the above and/or nested function
- call), optionally followed by more white space. That is followed by
- either a closing paren (right parenthesis, ')'), or by a comma (,)
- and another argument and so on until finally a closing paren.
+ immediately followed by an open parenthesis (left parenthesis '('),
+ followed by an argument list, followed by a closing parenthesis
+ (right parenthesis ')'). The argument list is a sequence of values
+ (numbers, strings, variables, array references, or expressions
+ involving the above and/or nested function calls), separated by
+ commas and optional white space.
The parentheses are required punctuation, except for the 'print' and
'printf' builtin IO functions, where they're optional, and for the
@@ -741,7 +732,7 @@
exp(n) the exponential of n ('e' raised to the 'n'th power)
log(n) natural logarithm of n
sin(n) sine of n (in radians)
- cos(n) cosine of n
+ cos(n) cosine of n (radians)
atan2(m,n) arctangent of m/n (radians)
rand() random number in the range 0 to 1 (exclusive)
srand(s) sets the random number 'seed' to s, so that a sequence
@@ -753,8 +744,9 @@
Builtin string functions
index(s,t) search string s for substring t; result is 1-based
offset of t within s, or 0 if not found
- length(s) returns the length of string s; 'length' without
- parenthesized argument returns length of $0
+ length(s) returns the length of string s; either 'length()'
+ with its argument omitted or 'length' without any
+ parenthesized argument list will return length of $0
match(s,r) search string s for regular expression r; the offset
of the longest, left-most substring which matches
is returned, or 0 if no match was found; the builtin
@@ -813,6 +805,7 @@
e day of month with leading space instead of leading 0 ( 1-31)
E ignored; following format character used
H hour (24 hour clock) as two digit number (00-23)
+ h abbreviated month name (Jan,Feb,...) [same as %b]
I hour (12 hour clock) as two digit number (01-12)
j day of year as three digit number (001-366)
m month as two digit number (01-12)
@@ -857,7 +850,7 @@
note: parentheses around the argument are *not*
allowed; return value is 1 for successful read, 0
if end of file is encountered, or -1 if some sort
- of error occured; [see 'redirection' for several
+ of error occurred; [see 'redirection' for several
variants]
close(s) close a file or pipe specified by the string s; the
string used should have the same value as the one
@@ -922,14 +915,15 @@
percent sign (%))
% include a literal percent sign (%) in the result
c format the next argument as a single ASCII character
- (argument should be numeric in the range 0 to 255)
+ (prints first character of string argument, or corresponding
+ ASCII character if numeric argument, e.g. 65 is 'A')
s format the next argument as a string (numeric arguments are
converted into strings on demand)
d decimal number (ie, integer value in base 10)
i integer (equivalent to decimal)
o octal number (integer in base 8)
- x hecadecimal number (integer in base 16) [lowercase]
- X hecadecimal number [digits 'A' thru 'E' in uppercase]
+ x hexadecimal number (integer in base 16) [lowercase]
+ X hexadecimal number [digits 'A' thru 'F' in uppercase]
f floating point number (digits, decimal point, fraction digits)
e exponential (scientific notation) number (digit, decimal
point, fraction digits, letter 'e', sign '+' or '-',
@@ -1076,7 +1070,7 @@
with the current command line I/O redirection. '>>$' isn't supported.
4 RS_peculiarities
Changing the record separator to something other than newline ('\n')
- will produce anomolous results for ordinary files. For example,
+ will produce anomalous results for ordinary files. For example,
using RS = "\f" and FS = "\n" with the following input
|rec 1, line 1
|rec 1, line 2
@@ -1094,7 +1088,7 @@
The following awk code will work-around this problem by inserting
a null first field in the first record, so that all records can be
handled the same by subsequent processing.
- # fixup for first record (RS != "\n")
+ # fix up for first record (RS != "\n")
FNR == 1 { if ( $0 == "" ) #leading separator
next #skip its null record
else #otherwise,
@@ -1109,7 +1103,7 @@
a pair of null fields separated by that newline. The following code
fragment will fix that provided there are no null records (in this
case, that would be two consecutive lines containing just form-feeds).
- # fixup for last record (RS != "\n")
+ # fix up for last record (RS != "\n")
$0 == FS { next } #drop spurious final record
Note that the "record not terminated" warning will persist.
4 cmd_inconsistency
@@ -1131,7 +1125,7 @@
'lint' and 'posix' run-time options added
'-W' command line option syntax supercedes '-c', '-C', and '-V'
'-a' and '-e' regular expression options made obsolete
- Various bug fixes and effiency improvements
+ Various bug fixes and efficiency improvements
More platforms supported ('officially' including VMS)
VMS-specific
@@ -1141,6 +1135,8 @@
Problem redirecting stderr (>&efile) at same time as stdin (<ifile)
or stdout (>ofile) has been fixed
``2>&1'' and ``1>&2'' redirection constructs added
+ Interaction between command line I/O redirection and gawk pipes
+ fixed; also, name used for pseudo-pipe temporary file expanded
3 license
GAWK is covered by the "GNU General Public License", the gist of which
is that if you supply this software to a third party, you are expressly
diff --git a/vms/vms.h b/vms/vms.h
index 6491a1f5..378adba2 100644
--- a/vms/vms.h
+++ b/vms/vms.h
@@ -33,11 +33,19 @@
#define CLI$_NOOPTPRS 0x00038840 /* no option present */
#endif
+#if 0
+#include <psldef.h>
+#else
+#define PSL$C_USER 3 /* user mode */
+#endif
+
#if !defined(_TYPES_) || !defined(__GNUC__)
typedef unsigned long u_long;
typedef unsigned short u_short;
#endif
typedef struct _dsc { int len; char *adr; } Dsc; /* limited string descriptor */
+ /* standard VMS itemlist-3 structure */
+typedef struct _itm { u_short len, code; void *buffer; u_short *retlen; } Itm;
#define vmswork(sts) ((sts)&1)
#define vmsfail(sts) (!vmswork(sts))
@@ -55,6 +63,13 @@ extern u_long SYS$QIO P((long, short, long, void *, const void *, long,
const char *, int, int, u_long, int, int));
extern u_long SYS$SYNCH P((long, void *));
#endif !NO_TTY_FWRITE
+ /* system services for logical name manipulation */
+extern u_long SYS$TRNLNM P((const u_long *,const Dsc *,const Dsc *,
+ const unsigned char *,Itm *));
+extern u_long SYS$CRELNM P((const u_long *,const Dsc *,const Dsc *,
+ const unsigned char *,const Itm *));
+extern u_long SYS$CRELOG P((int,const Dsc *,const Dsc *,unsigned char));
+extern u_long SYS$DELLNM P((const Dsc *,const Dsc *,const unsigned char *));
extern void v_add_arg P((int, const char *));
extern void vms_exit P((int));
diff --git a/vms/vms_args.c b/vms/vms_args.c
index b6736ff3..b317d8d0 100644
--- a/vms/vms_args.c
+++ b/vms/vms_args.c
@@ -11,8 +11,8 @@
*
* GAWK is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 1, or (at your option)
- * any later version.
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
*
* GAWK is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -21,7 +21,7 @@
*
* You should have received a copy of the GNU General Public License
* along with GAWK; see the file COPYING. If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
@@ -73,6 +73,7 @@
#include "awk.h" /* really "../awk.h" */
#include "vms.h"
+#include <lnmdef.h>
void v_add_arg(int, const char *);
static char *skipblanks(const char *);
@@ -236,13 +237,13 @@ ordinary_arg:
&& (f_err[len] == ':' || f_err[len] == '\0'))
err_to_out_redirect = 1;
else
- vms_define("SYS$ERROR", f_err);
+ (void) vms_define("SYS$ERROR", f_err);
}
/* do stdin before stdout, so we bomb we won't create empty output file */
if (f_in) { /* [re]open file and define logical name */
stdin = freopen(f_in, "r", stdin, "mbf=2");
if (stdin != NULL)
- vms_define("SYS$INPUT", f_in);
+ (void) vms_define("SYS$INPUT", f_in);
else
fatal("<%s (%s)", f_in, strerror(errno));
}
@@ -254,21 +255,21 @@ ordinary_arg:
# define BIGBUF 8*BUFSIZ /* maximum record size: 4096 instead of 512 */
setvbuf(stdout, malloc(BIGBUF), _IOFBF, BIGBUF);
#endif
- vms_define("SYS$OUTPUT", f_out);
+ (void) vms_define("SYS$OUTPUT", f_out);
} else
fatal(">%s%s (%s)", (*out_mode == 'a' ? ">" : ""),
f_out, strerror(errno));
}
if (err_to_out_redirect) { /* special case for ``2>&1'' construct */
- fclose(stderr);
- dup(1, 2); /* make file 2 (stderr) share file 1 (stdout) */
+ (void) fclose(stderr);
+ (void) dup2(1, 2); /* make file 2 (stderr) share file 1 (stdout) */
stderr = stdout;
- vms_define("SYS$ERROR", "SYS$OUTPUT:");
+ (void) vms_define("SYS$ERROR", "SYS$OUTPUT:");
} else if (out_to_err_redirect) { /* ``1>&2'' */
- fclose(stdout);
- dup(2, 1); /* make file 1 (stdout) share file 2 (stderr) */
+ (void) fclose(stdout);
+ (void) dup2(2, 1); /* make file 1 (stdout) share file 2 (stderr) */
stdout = stderr;
- vms_define("SYS$OUTPUT", "SYS$ERROR:");
+ (void) vms_define("SYS$OUTPUT", "SYS$ERROR:");
}
#ifndef NO_DCL_CMD
@@ -366,18 +367,21 @@ skipblanks( const char *ptr )
static u_long
vms_define( const char *log_name, const char *trans_val )
{
- Dsc log_dsc, trn_dsc;
-# define LOG_PROCESS_TABLE 2 /* <obsolete> */
-# define LOG_USERMODE 3 /* PSL$C_USER */
- extern u_long SYS$CRELOG(); /* <superceded by $CRELNM> */
+ Dsc log_dsc; /* descriptor for the logical name being defined */
+ static Descrip(lnmtable,"LNM$PROCESS_TABLE");
+ static long attr = LNM$M_CONFINE; /* create as 'confined' (see vms_popen.c for why) */
+ static Itm itemlist[] = { {sizeof attr,LNM$_ATTRIBUTES,&attr,0},
+ {0,LNM$_STRING,0,0}, {0,0} }; /* [1] is filled in per call, below */
+ static unsigned char acmode = PSL$C_USER; /* user mode */
/* avoid "define SYS$OUTPUT sys$output:" for redundant ">sys$output:" */
if (strncasecmp(log_name, trans_val, strlen(log_name)) == 0)
return 0;
log_dsc.len = strlen(log_dsc.adr = (char *)log_name);
- trn_dsc.len = strlen(trn_dsc.adr = (char *)trans_val);
- return SYS$CRELOG(LOG_PROCESS_TABLE, &log_dsc, &trn_dsc, LOG_USERMODE);
+ itemlist[1].buffer = (char *)trans_val; /* translation string for this call */
+ itemlist[1].len = strlen(trans_val);
+ return SYS$CRELNM((u_long *)0, &lnmtable, &log_dsc, &acmode, itemlist);
}
/* t_strstr -- strstr() substitute; search 'str' for 'sub' */
diff --git a/vms/vms_fwrite.c b/vms/vms_fwrite.c
index c0282c14..94c345a5 100644
--- a/vms/vms_fwrite.c
+++ b/vms/vms_fwrite.c
@@ -10,8 +10,8 @@
*
* GAWK is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 1, or (at your option)
- * any later version.
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
*
* GAWK is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -20,7 +20,7 @@
*
* You should have received a copy of the GNU General Public License
* along with GAWK; see the file COPYING. If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include "awk.h" /* really "../awk.h" */
diff --git a/vms/vms_gawk.c b/vms/vms_gawk.c
index 57abff7e..ec4747d3 100644
--- a/vms/vms_gawk.c
+++ b/vms/vms_gawk.c
@@ -10,8 +10,8 @@
*
* GAWK is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 1, or (at your option)
- * any later version.
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
*
* GAWK is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -20,7 +20,7 @@
*
* You should have received a copy of the GNU General Public License
* along with GAWK; see the file COPYING. If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
diff --git a/vms/vms_misc.c b/vms/vms_misc.c
index 8c7aee6a..c7044348 100644
--- a/vms/vms_misc.c
+++ b/vms/vms_misc.c
@@ -10,8 +10,8 @@
*
* GAWK is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 1, or (at your option)
- * any later version.
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
*
* GAWK is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -20,7 +20,7 @@
*
* You should have received a copy of the GNU General Public License
* along with GAWK; see the file COPYING. If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include "awk.h" /* really "../awk.h" */
diff --git a/vms/vms_popen.c b/vms/vms_popen.c
index f0eaa037..654364c4 100644
--- a/vms/vms_popen.c
+++ b/vms/vms_popen.c
@@ -10,8 +10,8 @@
*
* GAWK is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 1, or (at your option)
- * any later version.
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
*
* GAWK is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -20,7 +20,7 @@
*
* You should have received a copy of the GNU General Public License
* along with GAWK; see the file COPYING. If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#ifndef NO_VMS_PIPES
@@ -59,21 +59,39 @@ fork()
* Simulate pipes using temporary files; hope that the user
* doesn't expect pipe i/o to be interleaved with other i/o ;-}.
*
- * This is essentially the same as the MSDOS version. The
+ * This was initially based on the MSDOS version, but cannot
+ * use a static array to hold pipe info, because there's no
+ * fixed limit on the range of valid 'fileno's. Another
* difference is that redirection is handled using LIB$SPAWN
* rather than constructing a command for system() which uses
* '<' or '>'.
*/
#include "vms.h"
#include <errno.h>
+#include <lnmdef.h> /* logical name definitions */
+
+static void push_logicals P((void));
+static void pop_logicals P((void));
+static Itm *save_translation P((const Dsc *));
+static void restore_translation P((const Dsc *, const Itm *));
typedef enum { unopened = 0, reading, writing } pipemode;
-static
-struct {
+typedef struct pipe_info {
char *command;
char *name;
pipemode pmode;
-} pipes[_NFILE];
+} PIPE;
+static PIPE *pipes;
+static int pipes_lim = 0;
+
+#define psize(n) ((n) * sizeof(PIPE)) /* byte size of n PIPE entries */
+#define expand_pipes(k) do { PIPE *new_p; /* grow pipes[] so that index k is valid */ \
+ int new_p_lim = ((k) / _NFILE + 1) * _NFILE; /* smallest multiple of _NFILE above k */ \
+ emalloc(new_p, PIPE *, psize(new_p_lim), "expand_pipes"); \
+ if (pipes_lim > 0) /* carry the existing entries over */ \
+ memcpy(new_p, pipes, psize(pipes_lim)), free(pipes); \
+ memset(new_p + pipes_lim, 0, psize(new_p_lim - pipes_lim)); /* element offset, not bytes */ \
+ pipes = new_p, pipes_lim = new_p_lim; } while(0)
FILE *
popen( const char *command, const char *mode )
@@ -91,18 +109,20 @@ popen( const char *command, const char *mode )
return NULL;
/* make a name for the temporary file */
- if ((name = mktemp(strdup("sys$scratch:pipe_XXXX.tmp"))) == 0)
+ if ((name = mktemp(strdup("sys$scratch:gawk-pipe_XXXXXX.tmp"))) == 0)
return NULL;
if (curmode == reading) {
/* an input pipe reads a temporary file created by the command */
vms_execute(command, (char *)0, name); /* 'command >tempfile' */
}
- if ((current = fopen(name, mode)) == NULL) {
+ if ((current = fopen(name, mode, "mbf=2")) == NULL) {
free(name);
return NULL;
}
cur = fileno(current);
+ if (cur >= pipes_lim) expand_pipes(cur);
+ /* assert( cur >= 0 && cur < pipes_lim ); */
pipes[cur].name = name;
pipes[cur].pmode = curmode;
pipes[cur].command = strdup(command);
@@ -114,6 +134,7 @@ pclose( FILE *current )
{
int rval, cur = fileno(current);
+ /* assert( cur >= 0 && cur < pipes_lim ); */
if (pipes[cur].pmode == unopened)
return -1; /* should never happen */
@@ -152,8 +173,10 @@ vms_execute( const char *command, const char *input, const char *output )
else
out_p = 0;
+ push_logicals(); /* guard against user-mode definitions of sys$Xput */
sts = LIB$SPAWN(&cmd, in_p, out_p, (long *)0,
(Dsc *)0, (u_long *)0, &cmpltn_sts);
+ pop_logicals(); /* restore environment */
if (vmswork(sts) && vmsfail(cmpltn_sts)) sts = cmpltn_sts;
if (vmsfail(sts)) {
@@ -163,6 +186,164 @@ vms_execute( const char *command, const char *input, const char *output )
return 0;
}
+/*----*
+ This rigmarole is to guard against interference from the current
+ environment. User-mode definitions of SYS$INPUT and/or SYS$OUTPUT
+ will interact with spawned subprocesses--including LIB$SPAWN with
+ explicit input and/or output arguments specified--if they were
+ defined without the 'CONFINED' attribute. The definitions created
+ in vms_args.c as part of command line I/O redirection happened to
+ fall into this category :-(, but even though that's been fixed,
+ there's still the possibility of the user doing something like
+ |$ define/user sys$output foo.out
+ prior to starting the program. Without ``/name_attr=confine'',
+ that will really screw up pipe simulation, so we've got to work-
+ around it here. This is true whether pipes are implemented via
+ mailboxes or temporary files, as long as lib$spawn() is being used.
+
+ push_logicals() calls save_translation() the first time it's
+ invoked; the latter allocates some memory to hold a full logical
+ name translation and uses $trnlnm to fill that in. Then if either
+ sys$input or sys$output has a user-mode, non-confined translation,
+ push_logicals() will delete the definition(s) using $dellnm.
+ After the spawned command has returned, pop_logicals() is called;
+ it calls restore_translation() for any deleted values; the latter
+ uses $crelnm or $crelog to recreate the original definition.
+
+ SYS$ERROR is currently ignored; perhaps it should receive the same
+ treatment...
+*----*/
+
+ /* logical name table, and names of interest; these are all constant */
+static const Descrip(lnmtable,"LNM$PROCESS_TABLE");
+static const Descrip(sys_input,"SYS$INPUT");
+static const Descrip(sys_output,"SYS$OUTPUT");
+static const unsigned char acmode = PSL$C_USER; /* only care about user-mode */
+
+ /* macros for simplifying the code a bunch */
+#define DelTrans(l) SYS$DELLNM(&lnmtable, (l), &acmode)
+#define GetTrans(l,i) SYS$TRNLNM((u_long *)0, &lnmtable, (l), &acmode, (i))
+#define SetTrans(l,i) SYS$CRELNM((u_long *)0, &lnmtable, (l), &acmode, (i))
+ /* itemlist manipulation macros; separate versions for aggregate and scalar */
+#define SetItmA(i,c,p,r) ((i).code = (c), (i).len = sizeof (p),\
+ (i).buffer = (p), (i).retlen = (u_short *)(r))
+#define SetItmS(i,c,p) ((i).code = (c), (i).len = sizeof *(p),\
+ (i).buffer = (p), (i).retlen = (u_short *)0)
+#define EndItm0(i) ((i).code = (i).len = 0)
+
+ /* translate things once, then hold the results here for multiple re-use */
+static Itm *input_definition, *output_definition;
+
+static void
+push_logicals( void ) /* deassign sys$input and/or sys$output */
+{
+ static int init_done = 0; /* nonzero once both lookups have been made */
+
+ if (!init_done) { /* do logical name lookups one-time only */
+ input_definition = save_translation(&sys_input); /* null if nothing to hide */
+ output_definition = save_translation(&sys_output);
+ init_done = 1;
+ }
+ if (input_definition) DelTrans(&sys_input); /* kill sys$input; status is ignored */
+ if (output_definition) DelTrans(&sys_output); /* and sys$output */
+}
+
+static void
+pop_logicals( void ) /* redefine sys$input and/or sys$output */
+{ /* only names with a saved (non-null) definition are recreated */
+ if (input_definition) restore_translation(&sys_input, input_definition);
+ if (output_definition) restore_translation(&sys_output, output_definition);
+}
+
+static Itm *
+save_translation( const Dsc *logname ) /* returns saved itemlist, or 0 if nothing to save */
+{
+ Itm trans[4], *itmlst; /* probe list; full list handed back to caller */
+ long trans_attr, max_trans_indx; /* 0-based translation index count */
+ unsigned char trans_acmode; /* translation's access mode */
+ unsigned itmlst_size;
+ register int i, j; /* i scans all entries; j tracks the last one kept */
+
+ itmlst = 0;
+ /* Want translation index count for non-confined, user-mode definition;
+ unfortunately, $trnlnm does not provide that much control. Try to
+ fetch several values of interest, then decide based on the result.
+ */
+ SetItmS(trans[0], LNM$_MAX_INDEX, &max_trans_indx), max_trans_indx = -1;
+ SetItmS(trans[1], LNM$_ACMODE, &trans_acmode), trans_acmode = 0;
+ SetItmS(trans[2], LNM$_ATTRIBUTES, &trans_attr), trans_attr = 0;
+ EndItm0(trans[3]);
+ if (vmswork(GetTrans(logname, trans)) && max_trans_indx >= 0
+ && trans_acmode == PSL$C_USER && !(trans_attr & LNM$M_CONFINE)) {
+ /* Now know that definition of interest exists;
+ allocate and initialize an item list and associated buffers;
+ use three entries for each translation.
+ */
+ itmlst_size = (3 * (max_trans_indx + 1) + 1) * sizeof(Itm);
+ emalloc(itmlst, Itm *, itmlst_size, "save_translation");
+ for (i = 0; i <= max_trans_indx; i++) {
+ struct def { u_long indx, attr; u_short len;
+ char str[LNM$C_NAMLENGTH], eos; } *wrk;
+ emalloc(wrk, struct def *, sizeof (struct def), "save_translation");
+ wrk->indx = (u_long)i; /* this one's an input value for $trnlnm */
+ SetItmS(itmlst[3*i+0], LNM$_INDEX, &wrk->indx);
+ SetItmS(itmlst[3*i+1], LNM$_ATTRIBUTES, &wrk->attr), wrk->attr = 0;
+ SetItmA(itmlst[3*i+2], LNM$_STRING, wrk->str, &wrk->len), wrk->len = 0; /* array itself, so sizeof gives the buffer size */
+ }
+ EndItm0(itmlst[3*i]); /* assert( i == max_trans_indx+1 ); */
+ /* Time to perform full logical name translation,
+ then update item list for subsequent restoration.
+ If there are any holes [don't know whether that's possible]
+ collapse them out of the list; don't want them at restore time.
+ */
+ if (vmswork(GetTrans(logname, itmlst))) {
+ for (i = 0, j = -1; i <= max_trans_indx; i++) {
+ u_long *attr_p;
+ attr_p = itmlst[3*i+1].buffer; /* copy (void *) to true type */
+ if (*attr_p & LNM$M_EXISTS) {
+ *attr_p &= ~LNM$M_EXISTS; /* must clear this bit */
+ if (++j < i) itmlst[3*j+0] = itmlst[3*i+0],
+ itmlst[3*j+1] = itmlst[3*i+1],
+ itmlst[3*j+2] = itmlst[3*i+2];
+ if (itmlst[3*j+2].retlen) { /* fixup buffer length */
+ itmlst[3*j+2].len = *itmlst[3*j+2].retlen;
+ itmlst[3*j+2].retlen = (u_short *)0;
+ }
+ }
+ }
+ if (++j < i) EndItm0(itmlst[3*j]); /* re-terminate list if any holes were dropped */
+ } else /* should never happen; tolerate potential memory leak */
+ free(itmlst), itmlst = 0; /*('wrk' buffer(s) will become lost)*/
+ }
+ return itmlst;
+}
+
+static void
+restore_translation( const Dsc *logname, const Itm *itemlist ) /* recreate a saved definition */
+{
+ Dsc trans_val; /* descriptor for the single $crelog translation string */
+ u_long *attr_p; /* attributes of the first saved translation */
+# define LOG_PROCESS_TABLE 2 /* <obsolete> */
+# define LOG_USERMODE PSL$C_USER
+
+ /* assert( itemlist[1].code == LNM$_ATTRIBUTES ); */
+ attr_p = itemlist[1].buffer; /* copy (void *) to (u_long *) */
+ if (*attr_p & LNM$M_CRELOG) { /* check original creation method */
+ /* $crelog values can have only one translation;
+ so it'll be the first string entry in the itemlist.
+ */
+ /* assert( itemlist[2].code == LNM$_STRING ); */
+ trans_val.adr = itemlist[2].buffer;
+ trans_val.len = itemlist[2].len;
+ (void) SYS$CRELOG(LOG_PROCESS_TABLE, logname, &trans_val, LOG_USERMODE); /* status ignored */
+ } else {
+ /* $crelnm definition; itemlist could specify multiple translations,
+ but has already been setup properly for use as-is.
+ */
+ (void) SetTrans(logname, itemlist); /* status ignored */
+ }
+}
+
#endif /* PIPES_SIMULATED */
#endif /*!NO_VMS_PIPES*/