From 28ef1abc10cfbc2c3d2747c008eb2300858d0426 Mon Sep 17 00:00:00 2001 From: Lorry Tar Creator Date: Fri, 22 Apr 2016 04:38:07 +0000 Subject: grep-2.25 --- src/Makefile.am | 63 + src/Makefile.in | 1602 ++++++++++++++++++++ src/dfa.c | 4168 +++++++++++++++++++++++++++++++++++++++++++++++++++++ src/dfa.h | 119 ++ src/dfasearch.c | 451 ++++++ src/dosbuf.c | 222 +++ src/egrep.sh | 2 + src/grep.c | 2720 ++++++++++++++++++++++++++++++++++ src/grep.h | 34 + src/kwsearch.c | 165 +++ src/kwset.c | 868 +++++++++++ src/kwset.h | 60 + src/pcresearch.c | 389 +++++ src/search.h | 82 ++ src/searchutils.c | 127 ++ src/system.h | 110 ++ 16 files changed, 11182 insertions(+) create mode 100644 src/Makefile.am create mode 100644 src/Makefile.in create mode 100644 src/dfa.c create mode 100644 src/dfa.h create mode 100644 src/dfasearch.c create mode 100644 src/dosbuf.c create mode 100644 src/egrep.sh create mode 100644 src/grep.c create mode 100644 src/grep.h create mode 100644 src/kwsearch.c create mode 100644 src/kwset.c create mode 100644 src/kwset.h create mode 100644 src/pcresearch.c create mode 100644 src/search.h create mode 100644 src/searchutils.c create mode 100644 src/system.h (limited to 'src') diff --git a/src/Makefile.am b/src/Makefile.am new file mode 100644 index 0000000..941384e --- /dev/null +++ b/src/Makefile.am @@ -0,0 +1,63 @@ +## Process this file with automake to create Makefile.in +# Copyright 1997-1998, 2005-2016 Free Software Foundation, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +LN = ln + +AM_CFLAGS = $(WARN_CFLAGS) $(WERROR_CFLAGS) $(PCRE_CFLAGS) + +# Tell the linker to omit references to unused shared libraries. +AM_LDFLAGS = $(IGNORE_UNUSED_LIBRARIES_CFLAGS) + +bin_PROGRAMS = grep +bin_SCRIPTS = egrep fgrep +grep_SOURCES = grep.c searchutils.c \ + dfa.c dfasearch.c \ + kwset.c kwsearch.c \ + pcresearch.c +noinst_HEADERS = grep.h dfa.h kwset.h search.h system.h + +# Sometimes, the expansion of $(LIBINTL) includes -lc which may +# include modules defining variables like 'optind', so libgreputils.a +# must precede $(LIBINTL) in order to ensure we use GNU getopt. +# But libgreputils.a must also follow $(LIBINTL), since libintl uses +# replacement functions defined in libgreputils.a. +LDADD = \ + ../lib/libgreputils.a $(LIBINTL) ../lib/libgreputils.a $(LIBICONV) \ + $(LIBTHREAD) + +grep_LDADD = $(LDADD) $(PCRE_LIBS) +localedir = $(datadir)/locale +AM_CPPFLAGS = -I$(top_builddir)/lib -I$(top_srcdir)/lib + +EXTRA_DIST = dosbuf.c egrep.sh + +egrep fgrep: egrep.sh Makefile + $(AM_V_GEN)grep=`echo grep | sed -e '$(transform)'` && \ + case $@ in egrep) option=-E;; fgrep) option=-F;; esac && \ + shell_does_substrings='set x/y && d=$${1%/*} && test "$$d" = x' && \ + if $(SHELL) -c "$$shell_does_substrings" 2>/dev/null; then \ + edit_substring='s,X,X,'; \ + else \ + edit_substring='s,\$${0%/\*},`expr "X$$0" : '\''X\\(.*\\)/'\''`,g'; \ + fi && \ + sed -e 's|[@]SHELL@|$(SHELL)|g' \ + -e "$$edit_substring" \ + -e "s|[@]grep@|$$grep|g" \ + -e "s|[@]option@|$$option|g" <$(srcdir)/egrep.sh >$@-t + $(AM_V_at)chmod +x $@-t + $(AM_V_at)mv $@-t $@ + +CLEANFILES = egrep fgrep *-t diff --git a/src/Makefile.in b/src/Makefile.in new file mode 100644 index 0000000..02cb0b6 --- /dev/null +++ b/src/Makefile.in @@ -0,0 +1,1602 @@ +# Makefile.in generated by automake 1.99a from Makefile.am. 
+# @configure_input@ + +# Copyright (C) 1994-2015 Free Software Foundation, Inc. + +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +@SET_MAKE@ + +# Copyright 1997-1998, 2005-2016 Free Software Foundation, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + + + +VPATH = @srcdir@ +am__is_gnu_make = { \ + if test -z '$(MAKELEVEL)'; then \ + false; \ + elif test -n '$(MAKE_HOST)'; then \ + true; \ + elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \ + true; \ + else \ + false; \ + fi; \ +} +am__make_running_with_option = \ + case $${target_option-} in \ + ?) 
;; \ + *) echo "am__make_running_with_option: internal error: invalid" \ + "target option '$${target_option-}' specified" >&2; \ + exit 1;; \ + esac; \ + has_opt=no; \ + sane_makeflags=$$MAKEFLAGS; \ + if $(am__is_gnu_make); then \ + sane_makeflags=$$MFLAGS; \ + else \ + case $$MAKEFLAGS in \ + *\\[\ \ ]*) \ + bs=\\; \ + sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ + | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \ + esac; \ + fi; \ + skip_next=no; \ + strip_trailopt () \ + { \ + flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ + }; \ + for flg in $$sane_makeflags; do \ + test $$skip_next = yes && { skip_next=no; continue; }; \ + case $$flg in \ + *=*|--*) continue;; \ + -*I) strip_trailopt 'I'; skip_next=yes;; \ + -*I?*) strip_trailopt 'I';; \ + -*O) strip_trailopt 'O'; skip_next=yes;; \ + -*O?*) strip_trailopt 'O';; \ + -*l) strip_trailopt 'l'; skip_next=yes;; \ + -*l?*) strip_trailopt 'l';; \ + -[dEDm]) skip_next=yes;; \ + -[JT]) skip_next=yes;; \ + esac; \ + case $$flg in \ + *$$target_option*) has_opt=yes; break;; \ + esac; \ + done; \ + test $$has_opt = yes +am__make_dryrun = (target_option=n; $(am__make_running_with_option)) +am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) +pkgdatadir = $(datadir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = @build@ +host_triplet = @host@ +bin_PROGRAMS = grep$(EXEEXT) +subdir = src +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/m4/00gnulib.m4 \ + $(top_srcdir)/m4/absolute-header.m4 $(top_srcdir)/m4/alloca.m4 \ + $(top_srcdir)/m4/btowc.m4 
$(top_srcdir)/m4/chdir-long.m4 \ + $(top_srcdir)/m4/close-stream.m4 $(top_srcdir)/m4/close.m4 \ + $(top_srcdir)/m4/closedir.m4 $(top_srcdir)/m4/closeout.m4 \ + $(top_srcdir)/m4/codeset.m4 $(top_srcdir)/m4/configmake.m4 \ + $(top_srcdir)/m4/ctype.m4 $(top_srcdir)/m4/cycle-check.m4 \ + $(top_srcdir)/m4/d-ino.m4 $(top_srcdir)/m4/d-type.m4 \ + $(top_srcdir)/m4/dirent-safer.m4 $(top_srcdir)/m4/dirent_h.m4 \ + $(top_srcdir)/m4/dirfd.m4 $(top_srcdir)/m4/dirname.m4 \ + $(top_srcdir)/m4/double-slash-root.m4 $(top_srcdir)/m4/dup.m4 \ + $(top_srcdir)/m4/dup2.m4 $(top_srcdir)/m4/eealloc.m4 \ + $(top_srcdir)/m4/environ.m4 $(top_srcdir)/m4/errno_h.m4 \ + $(top_srcdir)/m4/error.m4 $(top_srcdir)/m4/exponentd.m4 \ + $(top_srcdir)/m4/extensions.m4 \ + $(top_srcdir)/m4/extern-inline.m4 $(top_srcdir)/m4/fchdir.m4 \ + $(top_srcdir)/m4/fcntl-o.m4 $(top_srcdir)/m4/fcntl-safer.m4 \ + $(top_srcdir)/m4/fcntl.m4 $(top_srcdir)/m4/fcntl_h.m4 \ + $(top_srcdir)/m4/fdopen.m4 $(top_srcdir)/m4/fdopendir.m4 \ + $(top_srcdir)/m4/filenamecat.m4 $(top_srcdir)/m4/flexmember.m4 \ + $(top_srcdir)/m4/float_h.m4 $(top_srcdir)/m4/fnmatch.m4 \ + $(top_srcdir)/m4/fpending.m4 $(top_srcdir)/m4/fpieee.m4 \ + $(top_srcdir)/m4/fstat.m4 $(top_srcdir)/m4/fstatat.m4 \ + $(top_srcdir)/m4/fts.m4 $(top_srcdir)/m4/getcwd.m4 \ + $(top_srcdir)/m4/getdtablesize.m4 $(top_srcdir)/m4/getopt.m4 \ + $(top_srcdir)/m4/getpagesize.m4 $(top_srcdir)/m4/gettext.m4 \ + $(top_srcdir)/m4/gettimeofday.m4 $(top_srcdir)/m4/glibc21.m4 \ + $(top_srcdir)/m4/gnulib-common.m4 \ + $(top_srcdir)/m4/gnulib-comp.m4 \ + $(top_srcdir)/m4/hard-locale.m4 $(top_srcdir)/m4/i-ring.m4 \ + $(top_srcdir)/m4/iconv.m4 $(top_srcdir)/m4/iconv_h.m4 \ + $(top_srcdir)/m4/iconv_open.m4 \ + $(top_srcdir)/m4/include_next.m4 $(top_srcdir)/m4/inline.m4 \ + $(top_srcdir)/m4/intlmacosx.m4 $(top_srcdir)/m4/intmax_t.m4 \ + $(top_srcdir)/m4/inttostr.m4 $(top_srcdir)/m4/inttypes-pri.m4 \ + $(top_srcdir)/m4/inttypes.m4 $(top_srcdir)/m4/inttypes_h.m4 \ + 
$(top_srcdir)/m4/isatty.m4 $(top_srcdir)/m4/isblank.m4 \ + $(top_srcdir)/m4/iswblank.m4 $(top_srcdir)/m4/iswctype.m4 \ + $(top_srcdir)/m4/langinfo_h.m4 $(top_srcdir)/m4/largefile.m4 \ + $(top_srcdir)/m4/lcmessage.m4 $(top_srcdir)/m4/lib-ld.m4 \ + $(top_srcdir)/m4/lib-link.m4 $(top_srcdir)/m4/lib-prefix.m4 \ + $(top_srcdir)/m4/libunistring-base.m4 \ + $(top_srcdir)/m4/localcharset.m4 $(top_srcdir)/m4/locale-fr.m4 \ + $(top_srcdir)/m4/locale-ja.m4 $(top_srcdir)/m4/locale-tr.m4 \ + $(top_srcdir)/m4/locale-zh.m4 $(top_srcdir)/m4/locale_h.m4 \ + $(top_srcdir)/m4/localeconv.m4 $(top_srcdir)/m4/localename.m4 \ + $(top_srcdir)/m4/lock.m4 $(top_srcdir)/m4/longlong.m4 \ + $(top_srcdir)/m4/lseek.m4 $(top_srcdir)/m4/lstat.m4 \ + $(top_srcdir)/m4/malloc.m4 $(top_srcdir)/m4/malloca.m4 \ + $(top_srcdir)/m4/manywarnings.m4 $(top_srcdir)/m4/mbchar.m4 \ + $(top_srcdir)/m4/mbiter.m4 $(top_srcdir)/m4/mbrlen.m4 \ + $(top_srcdir)/m4/mbrtowc.m4 $(top_srcdir)/m4/mbsinit.m4 \ + $(top_srcdir)/m4/mbslen.m4 $(top_srcdir)/m4/mbsrtowcs.m4 \ + $(top_srcdir)/m4/mbstate_t.m4 $(top_srcdir)/m4/mbtowc.m4 \ + $(top_srcdir)/m4/memchr.m4 $(top_srcdir)/m4/mempcpy.m4 \ + $(top_srcdir)/m4/memrchr.m4 $(top_srcdir)/m4/minmax.m4 \ + $(top_srcdir)/m4/mmap-anon.m4 $(top_srcdir)/m4/mode_t.m4 \ + $(top_srcdir)/m4/msvc-inval.m4 \ + $(top_srcdir)/m4/msvc-nothrow.m4 $(top_srcdir)/m4/multiarch.m4 \ + $(top_srcdir)/m4/nl_langinfo.m4 $(top_srcdir)/m4/nls.m4 \ + $(top_srcdir)/m4/nocrash.m4 $(top_srcdir)/m4/obstack.m4 \ + $(top_srcdir)/m4/off_t.m4 $(top_srcdir)/m4/onceonly.m4 \ + $(top_srcdir)/m4/open.m4 $(top_srcdir)/m4/openat.m4 \ + $(top_srcdir)/m4/opendir.m4 $(top_srcdir)/m4/pathmax.m4 \ + $(top_srcdir)/m4/pcre.m4 $(top_srcdir)/m4/perl.m4 \ + $(top_srcdir)/m4/pipe.m4 $(top_srcdir)/m4/pkg.m4 \ + $(top_srcdir)/m4/po.m4 $(top_srcdir)/m4/printf.m4 \ + $(top_srcdir)/m4/progtest.m4 $(top_srcdir)/m4/putenv.m4 \ + $(top_srcdir)/m4/quote.m4 $(top_srcdir)/m4/quotearg.m4 \ + $(top_srcdir)/m4/read.m4 $(top_srcdir)/m4/readdir.m4 
\ + $(top_srcdir)/m4/realloc.m4 $(top_srcdir)/m4/regex.m4 \ + $(top_srcdir)/m4/safe-read.m4 $(top_srcdir)/m4/save-cwd.m4 \ + $(top_srcdir)/m4/setenv.m4 $(top_srcdir)/m4/setlocale.m4 \ + $(top_srcdir)/m4/size_max.m4 $(top_srcdir)/m4/snprintf.m4 \ + $(top_srcdir)/m4/ssize_t.m4 $(top_srcdir)/m4/stat.m4 \ + $(top_srcdir)/m4/stdalign.m4 $(top_srcdir)/m4/stdarg.m4 \ + $(top_srcdir)/m4/stdbool.m4 $(top_srcdir)/m4/stddef_h.m4 \ + $(top_srcdir)/m4/stdint.m4 $(top_srcdir)/m4/stdint_h.m4 \ + $(top_srcdir)/m4/stdio_h.m4 $(top_srcdir)/m4/stdlib_h.m4 \ + $(top_srcdir)/m4/stpcpy.m4 $(top_srcdir)/m4/strdup.m4 \ + $(top_srcdir)/m4/strerror.m4 $(top_srcdir)/m4/string_h.m4 \ + $(top_srcdir)/m4/strnlen.m4 $(top_srcdir)/m4/strstr.m4 \ + $(top_srcdir)/m4/strtoimax.m4 $(top_srcdir)/m4/strtoll.m4 \ + $(top_srcdir)/m4/strtoull.m4 $(top_srcdir)/m4/strtoumax.m4 \ + $(top_srcdir)/m4/symlink.m4 $(top_srcdir)/m4/sys_socket_h.m4 \ + $(top_srcdir)/m4/sys_stat_h.m4 $(top_srcdir)/m4/sys_time_h.m4 \ + $(top_srcdir)/m4/sys_types_h.m4 $(top_srcdir)/m4/threadlib.m4 \ + $(top_srcdir)/m4/time_h.m4 $(top_srcdir)/m4/unistd-safer.m4 \ + $(top_srcdir)/m4/unistd_h.m4 $(top_srcdir)/m4/unlocked-io.m4 \ + $(top_srcdir)/m4/vasnprintf.m4 $(top_srcdir)/m4/version-etc.m4 \ + $(top_srcdir)/m4/warn-on-use.m4 $(top_srcdir)/m4/warnings.m4 \ + $(top_srcdir)/m4/wchar_h.m4 $(top_srcdir)/m4/wchar_t.m4 \ + $(top_srcdir)/m4/wcrtomb.m4 $(top_srcdir)/m4/wctob.m4 \ + $(top_srcdir)/m4/wctomb.m4 $(top_srcdir)/m4/wctype_h.m4 \ + $(top_srcdir)/m4/wcwidth.m4 $(top_srcdir)/m4/wint_t.m4 \ + $(top_srcdir)/m4/xalloc.m4 $(top_srcdir)/m4/xsize.m4 \ + $(top_srcdir)/m4/xstrtol.m4 $(top_srcdir)/configure.ac +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +DIST_COMMON = $(srcdir)/Makefile.am $(noinst_HEADERS) \ + $(am__DIST_COMMON) +mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = $(top_builddir)/config.h +CONFIG_CLEAN_FILES = +CONFIG_CLEAN_VPATH_FILES = +am__installdirs = "$(DESTDIR)$(bindir)" 
"$(DESTDIR)$(bindir)" +PROGRAMS = $(bin_PROGRAMS) +am_grep_OBJECTS = grep.$(OBJEXT) searchutils.$(OBJEXT) dfa.$(OBJEXT) \ + dfasearch.$(OBJEXT) kwset.$(OBJEXT) kwsearch.$(OBJEXT) \ + pcresearch.$(OBJEXT) +grep_OBJECTS = $(am_grep_OBJECTS) +am__DEPENDENCIES_1 = +am__DEPENDENCIES_2 = ../lib/libgreputils.a $(am__DEPENDENCIES_1) \ + ../lib/libgreputils.a $(am__DEPENDENCIES_1) \ + $(am__DEPENDENCIES_1) +grep_DEPENDENCIES = $(am__DEPENDENCIES_2) $(am__DEPENDENCIES_1) +am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; +am__vpath_adj = case $$p in \ + $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ + *) f=$$p;; \ + esac; +am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`; +am__install_max = 40 +am__nobase_strip_setup = \ + srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'` +am__nobase_strip = \ + for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||" +am__nobase_list = $(am__nobase_strip_setup); \ + for p in $$list; do echo "$$p $$p"; done | \ + sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \ + $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \ + if (++n[$$2] == $(am__install_max)) \ + { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \ + END { for (dir in files) print dir, files[dir] }' +am__base_list = \ + sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \ + sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g' +am__uninstall_files_from_dir = { \ + test -z "$$files" \ + || { test ! -d "$$dir" && test ! -f "$$dir" && test ! 
-r "$$dir"; } \ + || { echo " ( cd '$$dir' && rm -f" $$files ")"; \ + $(am__cd) "$$dir" && rm -f $$files; }; \ + } +SCRIPTS = $(bin_SCRIPTS) +AM_V_P = $(am__v_P_@AM_V@) +am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) +am__v_P_0 = false +am__v_P_1 = : +AM_V_GEN = $(am__v_GEN_@AM_V@) +am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@) +am__v_GEN_0 = @echo " GEN " $@; +am__v_GEN_1 = +AM_V_at = $(am__v_at_@AM_V@) +am__v_at_ = $(am__v_at_@AM_DEFAULT_V@) +am__v_at_0 = @ +am__v_at_1 = +DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir) +depcomp = $(SHELL) $(top_srcdir)/build-aux/depcomp +am__maybe_remake_depfiles = depfiles +am__depfiles_remade = ./$(DEPDIR)/dfa.Po ./$(DEPDIR)/dfasearch.Po \ + ./$(DEPDIR)/grep.Po ./$(DEPDIR)/kwsearch.Po \ + ./$(DEPDIR)/kwset.Po ./$(DEPDIR)/pcresearch.Po \ + ./$(DEPDIR)/searchutils.Po +am__mv = mv -f +am__set_depbase = depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.[^.]*$$||'` +COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +AM_V_CC = $(am__v_CC_@AM_V@) +am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@) +am__v_CC_0 = @echo " CC " $@; +am__v_CC_1 = +CCLD = $(CC) +LINK = $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +AM_V_CCLD = $(am__v_CCLD_@AM_V@) +am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@) +am__v_CCLD_0 = @echo " CCLD " $@; +am__v_CCLD_1 = +SOURCES = $(grep_SOURCES) +DIST_SOURCES = $(grep_SOURCES) +am__can_run_installinfo = \ + case $$AM_UPDATE_INFO_DIR in \ + n|no|NO) false;; \ + *) (install-info --version) >/dev/null 2>&1;; \ + esac +HEADERS = $(noinst_HEADERS) +am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) +# Read a list of newline-separated strings from the standard input, +# and print each of them once, without duplicates. Input order is +# *not* preserved. +am__uniquify_input = $(AWK) '\ + BEGIN { nonempty = 0; } \ + { items[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in items) print i; }; } \ +' +# Make sure the list of sources is unique. 
This is necessary because, +# e.g., the same source file might be shared among _SOURCES variables +# for different programs/libraries. +am__define_uniq_tagged_files = \ + list='$(am__tagged_files)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | $(am__uniquify_input)` +ETAGS = etags +CTAGS = ctags +am__DIST_COMMON = $(srcdir)/Makefile.in \ + $(top_srcdir)/build-aux/depcomp +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +pkglibexecdir = @pkglibexecdir@ +ACLOCAL = @ACLOCAL@ +ALLOCA = @ALLOCA@ +ALLOCA_H = @ALLOCA_H@ +AMTAR = @AMTAR@ +AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ +APPLE_UNIVERSAL_BUILD = @APPLE_UNIVERSAL_BUILD@ +AR = @AR@ +ARFLAGS = @ARFLAGS@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +BITSIZEOF_PTRDIFF_T = @BITSIZEOF_PTRDIFF_T@ +BITSIZEOF_SIG_ATOMIC_T = @BITSIZEOF_SIG_ATOMIC_T@ +BITSIZEOF_SIZE_T = @BITSIZEOF_SIZE_T@ +BITSIZEOF_WCHAR_T = @BITSIZEOF_WCHAR_T@ +BITSIZEOF_WINT_T = @BITSIZEOF_WINT_T@ +CC = @CC@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +COLORIZE_SOURCE = @COLORIZE_SOURCE@ +CONFIG_INCLUDE = @CONFIG_INCLUDE@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +EMULTIHOP_HIDDEN = @EMULTIHOP_HIDDEN@ +EMULTIHOP_VALUE = @EMULTIHOP_VALUE@ +ENOLINK_HIDDEN = @ENOLINK_HIDDEN@ +ENOLINK_VALUE = @ENOLINK_VALUE@ +EOVERFLOW_HIDDEN = @EOVERFLOW_HIDDEN@ +EOVERFLOW_VALUE = @EOVERFLOW_VALUE@ +ERRNO_H = @ERRNO_H@ +EXEEXT = @EXEEXT@ +FLOAT_H = @FLOAT_H@ +FNMATCH_H = @FNMATCH_H@ +GETOPT_H = @GETOPT_H@ +GETTEXT_MACRO_VERSION = @GETTEXT_MACRO_VERSION@ +GLIBC21 = @GLIBC21@ +GMSGFMT = @GMSGFMT@ +GMSGFMT_015 = @GMSGFMT_015@ +GNULIB_ALPHASORT = @GNULIB_ALPHASORT@ +GNULIB_ATOLL = @GNULIB_ATOLL@ +GNULIB_BTOWC = @GNULIB_BTOWC@ +GNULIB_CALLOC_POSIX = @GNULIB_CALLOC_POSIX@ +GNULIB_CANONICALIZE_FILE_NAME = 
@GNULIB_CANONICALIZE_FILE_NAME@ +GNULIB_CHDIR = @GNULIB_CHDIR@ +GNULIB_CHOWN = @GNULIB_CHOWN@ +GNULIB_CLOSE = @GNULIB_CLOSE@ +GNULIB_CLOSEDIR = @GNULIB_CLOSEDIR@ +GNULIB_DIRFD = @GNULIB_DIRFD@ +GNULIB_DPRINTF = @GNULIB_DPRINTF@ +GNULIB_DUP = @GNULIB_DUP@ +GNULIB_DUP2 = @GNULIB_DUP2@ +GNULIB_DUP3 = @GNULIB_DUP3@ +GNULIB_DUPLOCALE = @GNULIB_DUPLOCALE@ +GNULIB_ENVIRON = @GNULIB_ENVIRON@ +GNULIB_EUIDACCESS = @GNULIB_EUIDACCESS@ +GNULIB_FACCESSAT = @GNULIB_FACCESSAT@ +GNULIB_FCHDIR = @GNULIB_FCHDIR@ +GNULIB_FCHMODAT = @GNULIB_FCHMODAT@ +GNULIB_FCHOWNAT = @GNULIB_FCHOWNAT@ +GNULIB_FCLOSE = @GNULIB_FCLOSE@ +GNULIB_FCNTL = @GNULIB_FCNTL@ +GNULIB_FDATASYNC = @GNULIB_FDATASYNC@ +GNULIB_FDOPEN = @GNULIB_FDOPEN@ +GNULIB_FDOPENDIR = @GNULIB_FDOPENDIR@ +GNULIB_FFLUSH = @GNULIB_FFLUSH@ +GNULIB_FFSL = @GNULIB_FFSL@ +GNULIB_FFSLL = @GNULIB_FFSLL@ +GNULIB_FGETC = @GNULIB_FGETC@ +GNULIB_FGETS = @GNULIB_FGETS@ +GNULIB_FOPEN = @GNULIB_FOPEN@ +GNULIB_FPRINTF = @GNULIB_FPRINTF@ +GNULIB_FPRINTF_POSIX = @GNULIB_FPRINTF_POSIX@ +GNULIB_FPURGE = @GNULIB_FPURGE@ +GNULIB_FPUTC = @GNULIB_FPUTC@ +GNULIB_FPUTS = @GNULIB_FPUTS@ +GNULIB_FREAD = @GNULIB_FREAD@ +GNULIB_FREOPEN = @GNULIB_FREOPEN@ +GNULIB_FSCANF = @GNULIB_FSCANF@ +GNULIB_FSEEK = @GNULIB_FSEEK@ +GNULIB_FSEEKO = @GNULIB_FSEEKO@ +GNULIB_FSTAT = @GNULIB_FSTAT@ +GNULIB_FSTATAT = @GNULIB_FSTATAT@ +GNULIB_FSYNC = @GNULIB_FSYNC@ +GNULIB_FTELL = @GNULIB_FTELL@ +GNULIB_FTELLO = @GNULIB_FTELLO@ +GNULIB_FTRUNCATE = @GNULIB_FTRUNCATE@ +GNULIB_FUTIMENS = @GNULIB_FUTIMENS@ +GNULIB_FWRITE = @GNULIB_FWRITE@ +GNULIB_GETC = @GNULIB_GETC@ +GNULIB_GETCHAR = @GNULIB_GETCHAR@ +GNULIB_GETCWD = @GNULIB_GETCWD@ +GNULIB_GETDELIM = @GNULIB_GETDELIM@ +GNULIB_GETDOMAINNAME = @GNULIB_GETDOMAINNAME@ +GNULIB_GETDTABLESIZE = @GNULIB_GETDTABLESIZE@ +GNULIB_GETGROUPS = @GNULIB_GETGROUPS@ +GNULIB_GETHOSTNAME = @GNULIB_GETHOSTNAME@ +GNULIB_GETLINE = @GNULIB_GETLINE@ +GNULIB_GETLOADAVG = @GNULIB_GETLOADAVG@ +GNULIB_GETLOGIN = @GNULIB_GETLOGIN@ +GNULIB_GETLOGIN_R = 
@GNULIB_GETLOGIN_R@ +GNULIB_GETPAGESIZE = @GNULIB_GETPAGESIZE@ +GNULIB_GETSUBOPT = @GNULIB_GETSUBOPT@ +GNULIB_GETTIMEOFDAY = @GNULIB_GETTIMEOFDAY@ +GNULIB_GETUSERSHELL = @GNULIB_GETUSERSHELL@ +GNULIB_GL_UNISTD_H_GETOPT = @GNULIB_GL_UNISTD_H_GETOPT@ +GNULIB_GRANTPT = @GNULIB_GRANTPT@ +GNULIB_GROUP_MEMBER = @GNULIB_GROUP_MEMBER@ +GNULIB_ICONV = @GNULIB_ICONV@ +GNULIB_IMAXABS = @GNULIB_IMAXABS@ +GNULIB_IMAXDIV = @GNULIB_IMAXDIV@ +GNULIB_ISATTY = @GNULIB_ISATTY@ +GNULIB_ISBLANK = @GNULIB_ISBLANK@ +GNULIB_ISWBLANK = @GNULIB_ISWBLANK@ +GNULIB_ISWCTYPE = @GNULIB_ISWCTYPE@ +GNULIB_LCHMOD = @GNULIB_LCHMOD@ +GNULIB_LCHOWN = @GNULIB_LCHOWN@ +GNULIB_LINK = @GNULIB_LINK@ +GNULIB_LINKAT = @GNULIB_LINKAT@ +GNULIB_LOCALECONV = @GNULIB_LOCALECONV@ +GNULIB_LSEEK = @GNULIB_LSEEK@ +GNULIB_LSTAT = @GNULIB_LSTAT@ +GNULIB_MALLOC_POSIX = @GNULIB_MALLOC_POSIX@ +GNULIB_MBRLEN = @GNULIB_MBRLEN@ +GNULIB_MBRTOWC = @GNULIB_MBRTOWC@ +GNULIB_MBSCASECMP = @GNULIB_MBSCASECMP@ +GNULIB_MBSCASESTR = @GNULIB_MBSCASESTR@ +GNULIB_MBSCHR = @GNULIB_MBSCHR@ +GNULIB_MBSCSPN = @GNULIB_MBSCSPN@ +GNULIB_MBSINIT = @GNULIB_MBSINIT@ +GNULIB_MBSLEN = @GNULIB_MBSLEN@ +GNULIB_MBSNCASECMP = @GNULIB_MBSNCASECMP@ +GNULIB_MBSNLEN = @GNULIB_MBSNLEN@ +GNULIB_MBSNRTOWCS = @GNULIB_MBSNRTOWCS@ +GNULIB_MBSPBRK = @GNULIB_MBSPBRK@ +GNULIB_MBSPCASECMP = @GNULIB_MBSPCASECMP@ +GNULIB_MBSRCHR = @GNULIB_MBSRCHR@ +GNULIB_MBSRTOWCS = @GNULIB_MBSRTOWCS@ +GNULIB_MBSSEP = @GNULIB_MBSSEP@ +GNULIB_MBSSPN = @GNULIB_MBSSPN@ +GNULIB_MBSSTR = @GNULIB_MBSSTR@ +GNULIB_MBSTOK_R = @GNULIB_MBSTOK_R@ +GNULIB_MBTOWC = @GNULIB_MBTOWC@ +GNULIB_MEMCHR = @GNULIB_MEMCHR@ +GNULIB_MEMMEM = @GNULIB_MEMMEM@ +GNULIB_MEMPCPY = @GNULIB_MEMPCPY@ +GNULIB_MEMRCHR = @GNULIB_MEMRCHR@ +GNULIB_MKDIRAT = @GNULIB_MKDIRAT@ +GNULIB_MKDTEMP = @GNULIB_MKDTEMP@ +GNULIB_MKFIFO = @GNULIB_MKFIFO@ +GNULIB_MKFIFOAT = @GNULIB_MKFIFOAT@ +GNULIB_MKNOD = @GNULIB_MKNOD@ +GNULIB_MKNODAT = @GNULIB_MKNODAT@ +GNULIB_MKOSTEMP = @GNULIB_MKOSTEMP@ +GNULIB_MKOSTEMPS = @GNULIB_MKOSTEMPS@ 
+GNULIB_MKSTEMP = @GNULIB_MKSTEMP@ +GNULIB_MKSTEMPS = @GNULIB_MKSTEMPS@ +GNULIB_MKTIME = @GNULIB_MKTIME@ +GNULIB_NANOSLEEP = @GNULIB_NANOSLEEP@ +GNULIB_NL_LANGINFO = @GNULIB_NL_LANGINFO@ +GNULIB_NONBLOCKING = @GNULIB_NONBLOCKING@ +GNULIB_OBSTACK_PRINTF = @GNULIB_OBSTACK_PRINTF@ +GNULIB_OBSTACK_PRINTF_POSIX = @GNULIB_OBSTACK_PRINTF_POSIX@ +GNULIB_OPEN = @GNULIB_OPEN@ +GNULIB_OPENAT = @GNULIB_OPENAT@ +GNULIB_OPENDIR = @GNULIB_OPENDIR@ +GNULIB_PCLOSE = @GNULIB_PCLOSE@ +GNULIB_PERROR = @GNULIB_PERROR@ +GNULIB_PIPE = @GNULIB_PIPE@ +GNULIB_PIPE2 = @GNULIB_PIPE2@ +GNULIB_POPEN = @GNULIB_POPEN@ +GNULIB_POSIX_OPENPT = @GNULIB_POSIX_OPENPT@ +GNULIB_PREAD = @GNULIB_PREAD@ +GNULIB_PRINTF = @GNULIB_PRINTF@ +GNULIB_PRINTF_POSIX = @GNULIB_PRINTF_POSIX@ +GNULIB_PTSNAME = @GNULIB_PTSNAME@ +GNULIB_PTSNAME_R = @GNULIB_PTSNAME_R@ +GNULIB_PUTC = @GNULIB_PUTC@ +GNULIB_PUTCHAR = @GNULIB_PUTCHAR@ +GNULIB_PUTENV = @GNULIB_PUTENV@ +GNULIB_PUTS = @GNULIB_PUTS@ +GNULIB_PWRITE = @GNULIB_PWRITE@ +GNULIB_QSORT_R = @GNULIB_QSORT_R@ +GNULIB_RANDOM = @GNULIB_RANDOM@ +GNULIB_RANDOM_R = @GNULIB_RANDOM_R@ +GNULIB_RAWMEMCHR = @GNULIB_RAWMEMCHR@ +GNULIB_READ = @GNULIB_READ@ +GNULIB_READDIR = @GNULIB_READDIR@ +GNULIB_READLINK = @GNULIB_READLINK@ +GNULIB_READLINKAT = @GNULIB_READLINKAT@ +GNULIB_REALLOC_POSIX = @GNULIB_REALLOC_POSIX@ +GNULIB_REALPATH = @GNULIB_REALPATH@ +GNULIB_REMOVE = @GNULIB_REMOVE@ +GNULIB_RENAME = @GNULIB_RENAME@ +GNULIB_RENAMEAT = @GNULIB_RENAMEAT@ +GNULIB_REWINDDIR = @GNULIB_REWINDDIR@ +GNULIB_RMDIR = @GNULIB_RMDIR@ +GNULIB_RPMATCH = @GNULIB_RPMATCH@ +GNULIB_SCANDIR = @GNULIB_SCANDIR@ +GNULIB_SCANF = @GNULIB_SCANF@ +GNULIB_SECURE_GETENV = @GNULIB_SECURE_GETENV@ +GNULIB_SETENV = @GNULIB_SETENV@ +GNULIB_SETHOSTNAME = @GNULIB_SETHOSTNAME@ +GNULIB_SETLOCALE = @GNULIB_SETLOCALE@ +GNULIB_SLEEP = @GNULIB_SLEEP@ +GNULIB_SNPRINTF = @GNULIB_SNPRINTF@ +GNULIB_SPRINTF_POSIX = @GNULIB_SPRINTF_POSIX@ +GNULIB_STAT = @GNULIB_STAT@ +GNULIB_STDIO_H_NONBLOCKING = @GNULIB_STDIO_H_NONBLOCKING@ 
+GNULIB_STDIO_H_SIGPIPE = @GNULIB_STDIO_H_SIGPIPE@ +GNULIB_STPCPY = @GNULIB_STPCPY@ +GNULIB_STPNCPY = @GNULIB_STPNCPY@ +GNULIB_STRCASESTR = @GNULIB_STRCASESTR@ +GNULIB_STRCHRNUL = @GNULIB_STRCHRNUL@ +GNULIB_STRDUP = @GNULIB_STRDUP@ +GNULIB_STRERROR = @GNULIB_STRERROR@ +GNULIB_STRERROR_R = @GNULIB_STRERROR_R@ +GNULIB_STRNCAT = @GNULIB_STRNCAT@ +GNULIB_STRNDUP = @GNULIB_STRNDUP@ +GNULIB_STRNLEN = @GNULIB_STRNLEN@ +GNULIB_STRPBRK = @GNULIB_STRPBRK@ +GNULIB_STRPTIME = @GNULIB_STRPTIME@ +GNULIB_STRSEP = @GNULIB_STRSEP@ +GNULIB_STRSIGNAL = @GNULIB_STRSIGNAL@ +GNULIB_STRSTR = @GNULIB_STRSTR@ +GNULIB_STRTOD = @GNULIB_STRTOD@ +GNULIB_STRTOIMAX = @GNULIB_STRTOIMAX@ +GNULIB_STRTOK_R = @GNULIB_STRTOK_R@ +GNULIB_STRTOLL = @GNULIB_STRTOLL@ +GNULIB_STRTOULL = @GNULIB_STRTOULL@ +GNULIB_STRTOUMAX = @GNULIB_STRTOUMAX@ +GNULIB_STRVERSCMP = @GNULIB_STRVERSCMP@ +GNULIB_SYMLINK = @GNULIB_SYMLINK@ +GNULIB_SYMLINKAT = @GNULIB_SYMLINKAT@ +GNULIB_SYSTEM_POSIX = @GNULIB_SYSTEM_POSIX@ +GNULIB_TIMEGM = @GNULIB_TIMEGM@ +GNULIB_TIME_R = @GNULIB_TIME_R@ +GNULIB_TIME_RZ = @GNULIB_TIME_RZ@ +GNULIB_TMPFILE = @GNULIB_TMPFILE@ +GNULIB_TOWCTRANS = @GNULIB_TOWCTRANS@ +GNULIB_TTYNAME_R = @GNULIB_TTYNAME_R@ +GNULIB_UNISTD_H_NONBLOCKING = @GNULIB_UNISTD_H_NONBLOCKING@ +GNULIB_UNISTD_H_SIGPIPE = @GNULIB_UNISTD_H_SIGPIPE@ +GNULIB_UNLINK = @GNULIB_UNLINK@ +GNULIB_UNLINKAT = @GNULIB_UNLINKAT@ +GNULIB_UNLOCKPT = @GNULIB_UNLOCKPT@ +GNULIB_UNSETENV = @GNULIB_UNSETENV@ +GNULIB_USLEEP = @GNULIB_USLEEP@ +GNULIB_UTIMENSAT = @GNULIB_UTIMENSAT@ +GNULIB_VASPRINTF = @GNULIB_VASPRINTF@ +GNULIB_VDPRINTF = @GNULIB_VDPRINTF@ +GNULIB_VFPRINTF = @GNULIB_VFPRINTF@ +GNULIB_VFPRINTF_POSIX = @GNULIB_VFPRINTF_POSIX@ +GNULIB_VFSCANF = @GNULIB_VFSCANF@ +GNULIB_VPRINTF = @GNULIB_VPRINTF@ +GNULIB_VPRINTF_POSIX = @GNULIB_VPRINTF_POSIX@ +GNULIB_VSCANF = @GNULIB_VSCANF@ +GNULIB_VSNPRINTF = @GNULIB_VSNPRINTF@ +GNULIB_VSPRINTF_POSIX = @GNULIB_VSPRINTF_POSIX@ +GNULIB_WARN_CFLAGS = @GNULIB_WARN_CFLAGS@ +GNULIB_WCPCPY = @GNULIB_WCPCPY@ 
+GNULIB_WCPNCPY = @GNULIB_WCPNCPY@ +GNULIB_WCRTOMB = @GNULIB_WCRTOMB@ +GNULIB_WCSCASECMP = @GNULIB_WCSCASECMP@ +GNULIB_WCSCAT = @GNULIB_WCSCAT@ +GNULIB_WCSCHR = @GNULIB_WCSCHR@ +GNULIB_WCSCMP = @GNULIB_WCSCMP@ +GNULIB_WCSCOLL = @GNULIB_WCSCOLL@ +GNULIB_WCSCPY = @GNULIB_WCSCPY@ +GNULIB_WCSCSPN = @GNULIB_WCSCSPN@ +GNULIB_WCSDUP = @GNULIB_WCSDUP@ +GNULIB_WCSLEN = @GNULIB_WCSLEN@ +GNULIB_WCSNCASECMP = @GNULIB_WCSNCASECMP@ +GNULIB_WCSNCAT = @GNULIB_WCSNCAT@ +GNULIB_WCSNCMP = @GNULIB_WCSNCMP@ +GNULIB_WCSNCPY = @GNULIB_WCSNCPY@ +GNULIB_WCSNLEN = @GNULIB_WCSNLEN@ +GNULIB_WCSNRTOMBS = @GNULIB_WCSNRTOMBS@ +GNULIB_WCSPBRK = @GNULIB_WCSPBRK@ +GNULIB_WCSRCHR = @GNULIB_WCSRCHR@ +GNULIB_WCSRTOMBS = @GNULIB_WCSRTOMBS@ +GNULIB_WCSSPN = @GNULIB_WCSSPN@ +GNULIB_WCSSTR = @GNULIB_WCSSTR@ +GNULIB_WCSTOK = @GNULIB_WCSTOK@ +GNULIB_WCSWIDTH = @GNULIB_WCSWIDTH@ +GNULIB_WCSXFRM = @GNULIB_WCSXFRM@ +GNULIB_WCTOB = @GNULIB_WCTOB@ +GNULIB_WCTOMB = @GNULIB_WCTOMB@ +GNULIB_WCTRANS = @GNULIB_WCTRANS@ +GNULIB_WCTYPE = @GNULIB_WCTYPE@ +GNULIB_WCWIDTH = @GNULIB_WCWIDTH@ +GNULIB_WMEMCHR = @GNULIB_WMEMCHR@ +GNULIB_WMEMCMP = @GNULIB_WMEMCMP@ +GNULIB_WMEMCPY = @GNULIB_WMEMCPY@ +GNULIB_WMEMMOVE = @GNULIB_WMEMMOVE@ +GNULIB_WMEMSET = @GNULIB_WMEMSET@ +GNULIB_WRITE = @GNULIB_WRITE@ +GNULIB__EXIT = @GNULIB__EXIT@ +GREP = @GREP@ +HAVE_ALPHASORT = @HAVE_ALPHASORT@ +HAVE_ATOLL = @HAVE_ATOLL@ +HAVE_BTOWC = @HAVE_BTOWC@ +HAVE_CANONICALIZE_FILE_NAME = @HAVE_CANONICALIZE_FILE_NAME@ +HAVE_CHOWN = @HAVE_CHOWN@ +HAVE_CLOSEDIR = @HAVE_CLOSEDIR@ +HAVE_DECL_DIRFD = @HAVE_DECL_DIRFD@ +HAVE_DECL_ENVIRON = @HAVE_DECL_ENVIRON@ +HAVE_DECL_FCHDIR = @HAVE_DECL_FCHDIR@ +HAVE_DECL_FDATASYNC = @HAVE_DECL_FDATASYNC@ +HAVE_DECL_FDOPENDIR = @HAVE_DECL_FDOPENDIR@ +HAVE_DECL_FPURGE = @HAVE_DECL_FPURGE@ +HAVE_DECL_FSEEKO = @HAVE_DECL_FSEEKO@ +HAVE_DECL_FTELLO = @HAVE_DECL_FTELLO@ +HAVE_DECL_GETDELIM = @HAVE_DECL_GETDELIM@ +HAVE_DECL_GETDOMAINNAME = @HAVE_DECL_GETDOMAINNAME@ +HAVE_DECL_GETLINE = @HAVE_DECL_GETLINE@ +HAVE_DECL_GETLOADAVG = 
@HAVE_DECL_GETLOADAVG@ +HAVE_DECL_GETLOGIN_R = @HAVE_DECL_GETLOGIN_R@ +HAVE_DECL_GETPAGESIZE = @HAVE_DECL_GETPAGESIZE@ +HAVE_DECL_GETUSERSHELL = @HAVE_DECL_GETUSERSHELL@ +HAVE_DECL_IMAXABS = @HAVE_DECL_IMAXABS@ +HAVE_DECL_IMAXDIV = @HAVE_DECL_IMAXDIV@ +HAVE_DECL_LOCALTIME_R = @HAVE_DECL_LOCALTIME_R@ +HAVE_DECL_MEMMEM = @HAVE_DECL_MEMMEM@ +HAVE_DECL_MEMRCHR = @HAVE_DECL_MEMRCHR@ +HAVE_DECL_OBSTACK_PRINTF = @HAVE_DECL_OBSTACK_PRINTF@ +HAVE_DECL_SETENV = @HAVE_DECL_SETENV@ +HAVE_DECL_SETHOSTNAME = @HAVE_DECL_SETHOSTNAME@ +HAVE_DECL_SNPRINTF = @HAVE_DECL_SNPRINTF@ +HAVE_DECL_STRDUP = @HAVE_DECL_STRDUP@ +HAVE_DECL_STRERROR_R = @HAVE_DECL_STRERROR_R@ +HAVE_DECL_STRNDUP = @HAVE_DECL_STRNDUP@ +HAVE_DECL_STRNLEN = @HAVE_DECL_STRNLEN@ +HAVE_DECL_STRSIGNAL = @HAVE_DECL_STRSIGNAL@ +HAVE_DECL_STRTOIMAX = @HAVE_DECL_STRTOIMAX@ +HAVE_DECL_STRTOK_R = @HAVE_DECL_STRTOK_R@ +HAVE_DECL_STRTOUMAX = @HAVE_DECL_STRTOUMAX@ +HAVE_DECL_TTYNAME_R = @HAVE_DECL_TTYNAME_R@ +HAVE_DECL_UNSETENV = @HAVE_DECL_UNSETENV@ +HAVE_DECL_VSNPRINTF = @HAVE_DECL_VSNPRINTF@ +HAVE_DECL_WCTOB = @HAVE_DECL_WCTOB@ +HAVE_DECL_WCWIDTH = @HAVE_DECL_WCWIDTH@ +HAVE_DIRENT_H = @HAVE_DIRENT_H@ +HAVE_DPRINTF = @HAVE_DPRINTF@ +HAVE_DUP2 = @HAVE_DUP2@ +HAVE_DUP3 = @HAVE_DUP3@ +HAVE_DUPLOCALE = @HAVE_DUPLOCALE@ +HAVE_EUIDACCESS = @HAVE_EUIDACCESS@ +HAVE_FACCESSAT = @HAVE_FACCESSAT@ +HAVE_FCHDIR = @HAVE_FCHDIR@ +HAVE_FCHMODAT = @HAVE_FCHMODAT@ +HAVE_FCHOWNAT = @HAVE_FCHOWNAT@ +HAVE_FCNTL = @HAVE_FCNTL@ +HAVE_FDATASYNC = @HAVE_FDATASYNC@ +HAVE_FDOPENDIR = @HAVE_FDOPENDIR@ +HAVE_FEATURES_H = @HAVE_FEATURES_H@ +HAVE_FFSL = @HAVE_FFSL@ +HAVE_FFSLL = @HAVE_FFSLL@ +HAVE_FSEEKO = @HAVE_FSEEKO@ +HAVE_FSTATAT = @HAVE_FSTATAT@ +HAVE_FSYNC = @HAVE_FSYNC@ +HAVE_FTELLO = @HAVE_FTELLO@ +HAVE_FTRUNCATE = @HAVE_FTRUNCATE@ +HAVE_FUTIMENS = @HAVE_FUTIMENS@ +HAVE_GETDTABLESIZE = @HAVE_GETDTABLESIZE@ +HAVE_GETGROUPS = @HAVE_GETGROUPS@ +HAVE_GETHOSTNAME = @HAVE_GETHOSTNAME@ +HAVE_GETLOGIN = @HAVE_GETLOGIN@ +HAVE_GETOPT_H = @HAVE_GETOPT_H@ 
+HAVE_GETPAGESIZE = @HAVE_GETPAGESIZE@ +HAVE_GETSUBOPT = @HAVE_GETSUBOPT@ +HAVE_GETTIMEOFDAY = @HAVE_GETTIMEOFDAY@ +HAVE_GRANTPT = @HAVE_GRANTPT@ +HAVE_GROUP_MEMBER = @HAVE_GROUP_MEMBER@ +HAVE_INTTYPES_H = @HAVE_INTTYPES_H@ +HAVE_ISBLANK = @HAVE_ISBLANK@ +HAVE_ISWBLANK = @HAVE_ISWBLANK@ +HAVE_ISWCNTRL = @HAVE_ISWCNTRL@ +HAVE_LANGINFO_CODESET = @HAVE_LANGINFO_CODESET@ +HAVE_LANGINFO_ERA = @HAVE_LANGINFO_ERA@ +HAVE_LANGINFO_H = @HAVE_LANGINFO_H@ +HAVE_LANGINFO_T_FMT_AMPM = @HAVE_LANGINFO_T_FMT_AMPM@ +HAVE_LANGINFO_YESEXPR = @HAVE_LANGINFO_YESEXPR@ +HAVE_LCHMOD = @HAVE_LCHMOD@ +HAVE_LCHOWN = @HAVE_LCHOWN@ +HAVE_LINK = @HAVE_LINK@ +HAVE_LINKAT = @HAVE_LINKAT@ +HAVE_LONG_LONG_INT = @HAVE_LONG_LONG_INT@ +HAVE_LSTAT = @HAVE_LSTAT@ +HAVE_MAX_ALIGN_T = @HAVE_MAX_ALIGN_T@ +HAVE_MBRLEN = @HAVE_MBRLEN@ +HAVE_MBRTOWC = @HAVE_MBRTOWC@ +HAVE_MBSINIT = @HAVE_MBSINIT@ +HAVE_MBSLEN = @HAVE_MBSLEN@ +HAVE_MBSNRTOWCS = @HAVE_MBSNRTOWCS@ +HAVE_MBSRTOWCS = @HAVE_MBSRTOWCS@ +HAVE_MEMCHR = @HAVE_MEMCHR@ +HAVE_MEMPCPY = @HAVE_MEMPCPY@ +HAVE_MKDIRAT = @HAVE_MKDIRAT@ +HAVE_MKDTEMP = @HAVE_MKDTEMP@ +HAVE_MKFIFO = @HAVE_MKFIFO@ +HAVE_MKFIFOAT = @HAVE_MKFIFOAT@ +HAVE_MKNOD = @HAVE_MKNOD@ +HAVE_MKNODAT = @HAVE_MKNODAT@ +HAVE_MKOSTEMP = @HAVE_MKOSTEMP@ +HAVE_MKOSTEMPS = @HAVE_MKOSTEMPS@ +HAVE_MKSTEMP = @HAVE_MKSTEMP@ +HAVE_MKSTEMPS = @HAVE_MKSTEMPS@ +HAVE_MSVC_INVALID_PARAMETER_HANDLER = @HAVE_MSVC_INVALID_PARAMETER_HANDLER@ +HAVE_NANOSLEEP = @HAVE_NANOSLEEP@ +HAVE_NL_LANGINFO = @HAVE_NL_LANGINFO@ +HAVE_OPENAT = @HAVE_OPENAT@ +HAVE_OPENDIR = @HAVE_OPENDIR@ +HAVE_OS_H = @HAVE_OS_H@ +HAVE_PCLOSE = @HAVE_PCLOSE@ +HAVE_PIPE = @HAVE_PIPE@ +HAVE_PIPE2 = @HAVE_PIPE2@ +HAVE_POPEN = @HAVE_POPEN@ +HAVE_POSIX_OPENPT = @HAVE_POSIX_OPENPT@ +HAVE_PREAD = @HAVE_PREAD@ +HAVE_PTSNAME = @HAVE_PTSNAME@ +HAVE_PTSNAME_R = @HAVE_PTSNAME_R@ +HAVE_PWRITE = @HAVE_PWRITE@ +HAVE_RANDOM = @HAVE_RANDOM@ +HAVE_RANDOM_H = @HAVE_RANDOM_H@ +HAVE_RANDOM_R = @HAVE_RANDOM_R@ +HAVE_RAWMEMCHR = @HAVE_RAWMEMCHR@ +HAVE_READDIR = 
@HAVE_READDIR@ +HAVE_READLINK = @HAVE_READLINK@ +HAVE_READLINKAT = @HAVE_READLINKAT@ +HAVE_REALPATH = @HAVE_REALPATH@ +HAVE_RENAMEAT = @HAVE_RENAMEAT@ +HAVE_REWINDDIR = @HAVE_REWINDDIR@ +HAVE_RPMATCH = @HAVE_RPMATCH@ +HAVE_SCANDIR = @HAVE_SCANDIR@ +HAVE_SECURE_GETENV = @HAVE_SECURE_GETENV@ +HAVE_SETENV = @HAVE_SETENV@ +HAVE_SETHOSTNAME = @HAVE_SETHOSTNAME@ +HAVE_SIGNED_SIG_ATOMIC_T = @HAVE_SIGNED_SIG_ATOMIC_T@ +HAVE_SIGNED_WCHAR_T = @HAVE_SIGNED_WCHAR_T@ +HAVE_SIGNED_WINT_T = @HAVE_SIGNED_WINT_T@ +HAVE_SLEEP = @HAVE_SLEEP@ +HAVE_STDINT_H = @HAVE_STDINT_H@ +HAVE_STPCPY = @HAVE_STPCPY@ +HAVE_STPNCPY = @HAVE_STPNCPY@ +HAVE_STRCASESTR = @HAVE_STRCASESTR@ +HAVE_STRCHRNUL = @HAVE_STRCHRNUL@ +HAVE_STRPBRK = @HAVE_STRPBRK@ +HAVE_STRPTIME = @HAVE_STRPTIME@ +HAVE_STRSEP = @HAVE_STRSEP@ +HAVE_STRTOD = @HAVE_STRTOD@ +HAVE_STRTOLL = @HAVE_STRTOLL@ +HAVE_STRTOULL = @HAVE_STRTOULL@ +HAVE_STRUCT_RANDOM_DATA = @HAVE_STRUCT_RANDOM_DATA@ +HAVE_STRUCT_TIMEVAL = @HAVE_STRUCT_TIMEVAL@ +HAVE_STRVERSCMP = @HAVE_STRVERSCMP@ +HAVE_SYMLINK = @HAVE_SYMLINK@ +HAVE_SYMLINKAT = @HAVE_SYMLINKAT@ +HAVE_SYS_BITYPES_H = @HAVE_SYS_BITYPES_H@ +HAVE_SYS_INTTYPES_H = @HAVE_SYS_INTTYPES_H@ +HAVE_SYS_LOADAVG_H = @HAVE_SYS_LOADAVG_H@ +HAVE_SYS_PARAM_H = @HAVE_SYS_PARAM_H@ +HAVE_SYS_TIME_H = @HAVE_SYS_TIME_H@ +HAVE_SYS_TYPES_H = @HAVE_SYS_TYPES_H@ +HAVE_TIMEGM = @HAVE_TIMEGM@ +HAVE_TIMEZONE_T = @HAVE_TIMEZONE_T@ +HAVE_UNISTD_H = @HAVE_UNISTD_H@ +HAVE_UNLINKAT = @HAVE_UNLINKAT@ +HAVE_UNLOCKPT = @HAVE_UNLOCKPT@ +HAVE_UNSIGNED_LONG_LONG_INT = @HAVE_UNSIGNED_LONG_LONG_INT@ +HAVE_USLEEP = @HAVE_USLEEP@ +HAVE_UTIMENSAT = @HAVE_UTIMENSAT@ +HAVE_VASPRINTF = @HAVE_VASPRINTF@ +HAVE_VDPRINTF = @HAVE_VDPRINTF@ +HAVE_WCHAR_H = @HAVE_WCHAR_H@ +HAVE_WCHAR_T = @HAVE_WCHAR_T@ +HAVE_WCPCPY = @HAVE_WCPCPY@ +HAVE_WCPNCPY = @HAVE_WCPNCPY@ +HAVE_WCRTOMB = @HAVE_WCRTOMB@ +HAVE_WCSCASECMP = @HAVE_WCSCASECMP@ +HAVE_WCSCAT = @HAVE_WCSCAT@ +HAVE_WCSCHR = @HAVE_WCSCHR@ +HAVE_WCSCMP = @HAVE_WCSCMP@ +HAVE_WCSCOLL = @HAVE_WCSCOLL@ 
+HAVE_WCSCPY = @HAVE_WCSCPY@ +HAVE_WCSCSPN = @HAVE_WCSCSPN@ +HAVE_WCSDUP = @HAVE_WCSDUP@ +HAVE_WCSLEN = @HAVE_WCSLEN@ +HAVE_WCSNCASECMP = @HAVE_WCSNCASECMP@ +HAVE_WCSNCAT = @HAVE_WCSNCAT@ +HAVE_WCSNCMP = @HAVE_WCSNCMP@ +HAVE_WCSNCPY = @HAVE_WCSNCPY@ +HAVE_WCSNLEN = @HAVE_WCSNLEN@ +HAVE_WCSNRTOMBS = @HAVE_WCSNRTOMBS@ +HAVE_WCSPBRK = @HAVE_WCSPBRK@ +HAVE_WCSRCHR = @HAVE_WCSRCHR@ +HAVE_WCSRTOMBS = @HAVE_WCSRTOMBS@ +HAVE_WCSSPN = @HAVE_WCSSPN@ +HAVE_WCSSTR = @HAVE_WCSSTR@ +HAVE_WCSTOK = @HAVE_WCSTOK@ +HAVE_WCSWIDTH = @HAVE_WCSWIDTH@ +HAVE_WCSXFRM = @HAVE_WCSXFRM@ +HAVE_WCTRANS_T = @HAVE_WCTRANS_T@ +HAVE_WCTYPE_H = @HAVE_WCTYPE_H@ +HAVE_WCTYPE_T = @HAVE_WCTYPE_T@ +HAVE_WINSOCK2_H = @HAVE_WINSOCK2_H@ +HAVE_WINT_T = @HAVE_WINT_T@ +HAVE_WMEMCHR = @HAVE_WMEMCHR@ +HAVE_WMEMCMP = @HAVE_WMEMCMP@ +HAVE_WMEMCPY = @HAVE_WMEMCPY@ +HAVE_WMEMMOVE = @HAVE_WMEMMOVE@ +HAVE_WMEMSET = @HAVE_WMEMSET@ +HAVE_XLOCALE_H = @HAVE_XLOCALE_H@ +HAVE__BOOL = @HAVE__BOOL@ +HAVE__EXIT = @HAVE__EXIT@ +ICONV_CONST = @ICONV_CONST@ +ICONV_H = @ICONV_H@ +INCLUDE_NEXT = @INCLUDE_NEXT@ +INCLUDE_NEXT_AS_FIRST_DIRECTIVE = @INCLUDE_NEXT_AS_FIRST_DIRECTIVE@ +INSTALL = @INSTALL@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +INT32_MAX_LT_INTMAX_MAX = @INT32_MAX_LT_INTMAX_MAX@ +INT64_MAX_EQ_LONG_MAX = @INT64_MAX_EQ_LONG_MAX@ +INTLLIBS = @INTLLIBS@ +INTL_MACOSX_LIBS = @INTL_MACOSX_LIBS@ +LDFLAGS = @LDFLAGS@ +LIBGREPUTILS_LIBDEPS = @LIBGREPUTILS_LIBDEPS@ +LIBGREPUTILS_LTLIBDEPS = @LIBGREPUTILS_LTLIBDEPS@ +LIBICONV = @LIBICONV@ +LIBINTL = @LIBINTL@ +LIBMULTITHREAD = @LIBMULTITHREAD@ +LIBOBJS = @LIBOBJS@ +LIBPTH = @LIBPTH@ +LIBPTH_PREFIX = @LIBPTH_PREFIX@ +LIBS = @LIBS@ +LIBTESTS_LIBDEPS = @LIBTESTS_LIBDEPS@ +LIBTHREAD = @LIBTHREAD@ +LIBUNISTRING_UNISTR_H = @LIBUNISTRING_UNISTR_H@ +LIBUNISTRING_UNITYPES_H = @LIBUNISTRING_UNITYPES_H@ +LIBUNISTRING_UNIWIDTH_H = @LIBUNISTRING_UNIWIDTH_H@ 
+LOCALCHARSET_TESTS_ENVIRONMENT = @LOCALCHARSET_TESTS_ENVIRONMENT@ +LOCALE_FR = @LOCALE_FR@ +LOCALE_FR_UTF8 = @LOCALE_FR_UTF8@ +LOCALE_JA = @LOCALE_JA@ +LOCALE_TR_UTF8 = @LOCALE_TR_UTF8@ +LOCALE_ZH_CN = @LOCALE_ZH_CN@ +LTLIBICONV = @LTLIBICONV@ +LTLIBINTL = @LTLIBINTL@ +LTLIBMULTITHREAD = @LTLIBMULTITHREAD@ +LTLIBOBJS = @LTLIBOBJS@ +LTLIBPTH = @LTLIBPTH@ +LTLIBTHREAD = @LTLIBTHREAD@ +MAKEINFO = @MAKEINFO@ +MKDIR_P = @MKDIR_P@ +MSGFMT = @MSGFMT@ +MSGFMT_015 = @MSGFMT_015@ +MSGMERGE = @MSGMERGE@ +NEXT_AS_FIRST_DIRECTIVE_CTYPE_H = @NEXT_AS_FIRST_DIRECTIVE_CTYPE_H@ +NEXT_AS_FIRST_DIRECTIVE_DIRENT_H = @NEXT_AS_FIRST_DIRECTIVE_DIRENT_H@ +NEXT_AS_FIRST_DIRECTIVE_ERRNO_H = @NEXT_AS_FIRST_DIRECTIVE_ERRNO_H@ +NEXT_AS_FIRST_DIRECTIVE_FCNTL_H = @NEXT_AS_FIRST_DIRECTIVE_FCNTL_H@ +NEXT_AS_FIRST_DIRECTIVE_FLOAT_H = @NEXT_AS_FIRST_DIRECTIVE_FLOAT_H@ +NEXT_AS_FIRST_DIRECTIVE_GETOPT_H = @NEXT_AS_FIRST_DIRECTIVE_GETOPT_H@ +NEXT_AS_FIRST_DIRECTIVE_ICONV_H = @NEXT_AS_FIRST_DIRECTIVE_ICONV_H@ +NEXT_AS_FIRST_DIRECTIVE_INTTYPES_H = @NEXT_AS_FIRST_DIRECTIVE_INTTYPES_H@ +NEXT_AS_FIRST_DIRECTIVE_LANGINFO_H = @NEXT_AS_FIRST_DIRECTIVE_LANGINFO_H@ +NEXT_AS_FIRST_DIRECTIVE_LOCALE_H = @NEXT_AS_FIRST_DIRECTIVE_LOCALE_H@ +NEXT_AS_FIRST_DIRECTIVE_STDARG_H = @NEXT_AS_FIRST_DIRECTIVE_STDARG_H@ +NEXT_AS_FIRST_DIRECTIVE_STDDEF_H = @NEXT_AS_FIRST_DIRECTIVE_STDDEF_H@ +NEXT_AS_FIRST_DIRECTIVE_STDINT_H = @NEXT_AS_FIRST_DIRECTIVE_STDINT_H@ +NEXT_AS_FIRST_DIRECTIVE_STDIO_H = @NEXT_AS_FIRST_DIRECTIVE_STDIO_H@ +NEXT_AS_FIRST_DIRECTIVE_STDLIB_H = @NEXT_AS_FIRST_DIRECTIVE_STDLIB_H@ +NEXT_AS_FIRST_DIRECTIVE_STRING_H = @NEXT_AS_FIRST_DIRECTIVE_STRING_H@ +NEXT_AS_FIRST_DIRECTIVE_SYS_STAT_H = @NEXT_AS_FIRST_DIRECTIVE_SYS_STAT_H@ +NEXT_AS_FIRST_DIRECTIVE_SYS_TIME_H = @NEXT_AS_FIRST_DIRECTIVE_SYS_TIME_H@ +NEXT_AS_FIRST_DIRECTIVE_SYS_TYPES_H = @NEXT_AS_FIRST_DIRECTIVE_SYS_TYPES_H@ +NEXT_AS_FIRST_DIRECTIVE_TIME_H = @NEXT_AS_FIRST_DIRECTIVE_TIME_H@ +NEXT_AS_FIRST_DIRECTIVE_UNISTD_H = @NEXT_AS_FIRST_DIRECTIVE_UNISTD_H@ 
+NEXT_AS_FIRST_DIRECTIVE_WCHAR_H = @NEXT_AS_FIRST_DIRECTIVE_WCHAR_H@ +NEXT_AS_FIRST_DIRECTIVE_WCTYPE_H = @NEXT_AS_FIRST_DIRECTIVE_WCTYPE_H@ +NEXT_CTYPE_H = @NEXT_CTYPE_H@ +NEXT_DIRENT_H = @NEXT_DIRENT_H@ +NEXT_ERRNO_H = @NEXT_ERRNO_H@ +NEXT_FCNTL_H = @NEXT_FCNTL_H@ +NEXT_FLOAT_H = @NEXT_FLOAT_H@ +NEXT_GETOPT_H = @NEXT_GETOPT_H@ +NEXT_ICONV_H = @NEXT_ICONV_H@ +NEXT_INTTYPES_H = @NEXT_INTTYPES_H@ +NEXT_LANGINFO_H = @NEXT_LANGINFO_H@ +NEXT_LOCALE_H = @NEXT_LOCALE_H@ +NEXT_STDARG_H = @NEXT_STDARG_H@ +NEXT_STDDEF_H = @NEXT_STDDEF_H@ +NEXT_STDINT_H = @NEXT_STDINT_H@ +NEXT_STDIO_H = @NEXT_STDIO_H@ +NEXT_STDLIB_H = @NEXT_STDLIB_H@ +NEXT_STRING_H = @NEXT_STRING_H@ +NEXT_SYS_STAT_H = @NEXT_SYS_STAT_H@ +NEXT_SYS_TIME_H = @NEXT_SYS_TIME_H@ +NEXT_SYS_TYPES_H = @NEXT_SYS_TYPES_H@ +NEXT_TIME_H = @NEXT_TIME_H@ +NEXT_UNISTD_H = @NEXT_UNISTD_H@ +NEXT_WCHAR_H = @NEXT_WCHAR_H@ +NEXT_WCTYPE_H = @NEXT_WCTYPE_H@ +OBJEXT = @OBJEXT@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_URL = @PACKAGE_URL@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +PCRE_CFLAGS = @PCRE_CFLAGS@ +PCRE_LIBS = @PCRE_LIBS@ +PERL = @PERL@ +PKG_CONFIG = @PKG_CONFIG@ +PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@ +PKG_CONFIG_PATH = @PKG_CONFIG_PATH@ +POSUB = @POSUB@ +PRAGMA_COLUMNS = @PRAGMA_COLUMNS@ +PRAGMA_SYSTEM_HEADER = @PRAGMA_SYSTEM_HEADER@ +PRIPTR_PREFIX = @PRIPTR_PREFIX@ +PRI_MACROS_BROKEN = @PRI_MACROS_BROKEN@ +PTHREAD_H_DEFINES_STRUCT_TIMESPEC = @PTHREAD_H_DEFINES_STRUCT_TIMESPEC@ +PTRDIFF_T_SUFFIX = @PTRDIFF_T_SUFFIX@ +RANLIB = @RANLIB@ +REPLACE_BTOWC = @REPLACE_BTOWC@ +REPLACE_CALLOC = @REPLACE_CALLOC@ +REPLACE_CANONICALIZE_FILE_NAME = @REPLACE_CANONICALIZE_FILE_NAME@ +REPLACE_CHOWN = @REPLACE_CHOWN@ +REPLACE_CLOSE = @REPLACE_CLOSE@ +REPLACE_CLOSEDIR = @REPLACE_CLOSEDIR@ +REPLACE_DIRFD = @REPLACE_DIRFD@ +REPLACE_DPRINTF = @REPLACE_DPRINTF@ +REPLACE_DUP = 
@REPLACE_DUP@ +REPLACE_DUP2 = @REPLACE_DUP2@ +REPLACE_DUPLOCALE = @REPLACE_DUPLOCALE@ +REPLACE_FCHOWNAT = @REPLACE_FCHOWNAT@ +REPLACE_FCLOSE = @REPLACE_FCLOSE@ +REPLACE_FCNTL = @REPLACE_FCNTL@ +REPLACE_FDOPEN = @REPLACE_FDOPEN@ +REPLACE_FDOPENDIR = @REPLACE_FDOPENDIR@ +REPLACE_FFLUSH = @REPLACE_FFLUSH@ +REPLACE_FOPEN = @REPLACE_FOPEN@ +REPLACE_FPRINTF = @REPLACE_FPRINTF@ +REPLACE_FPURGE = @REPLACE_FPURGE@ +REPLACE_FREOPEN = @REPLACE_FREOPEN@ +REPLACE_FSEEK = @REPLACE_FSEEK@ +REPLACE_FSEEKO = @REPLACE_FSEEKO@ +REPLACE_FSTAT = @REPLACE_FSTAT@ +REPLACE_FSTATAT = @REPLACE_FSTATAT@ +REPLACE_FTELL = @REPLACE_FTELL@ +REPLACE_FTELLO = @REPLACE_FTELLO@ +REPLACE_FTRUNCATE = @REPLACE_FTRUNCATE@ +REPLACE_FUTIMENS = @REPLACE_FUTIMENS@ +REPLACE_GETCWD = @REPLACE_GETCWD@ +REPLACE_GETDELIM = @REPLACE_GETDELIM@ +REPLACE_GETDOMAINNAME = @REPLACE_GETDOMAINNAME@ +REPLACE_GETDTABLESIZE = @REPLACE_GETDTABLESIZE@ +REPLACE_GETGROUPS = @REPLACE_GETGROUPS@ +REPLACE_GETLINE = @REPLACE_GETLINE@ +REPLACE_GETLOGIN_R = @REPLACE_GETLOGIN_R@ +REPLACE_GETPAGESIZE = @REPLACE_GETPAGESIZE@ +REPLACE_GETTIMEOFDAY = @REPLACE_GETTIMEOFDAY@ +REPLACE_GMTIME = @REPLACE_GMTIME@ +REPLACE_ICONV = @REPLACE_ICONV@ +REPLACE_ICONV_OPEN = @REPLACE_ICONV_OPEN@ +REPLACE_ICONV_UTF = @REPLACE_ICONV_UTF@ +REPLACE_ISATTY = @REPLACE_ISATTY@ +REPLACE_ISWBLANK = @REPLACE_ISWBLANK@ +REPLACE_ISWCNTRL = @REPLACE_ISWCNTRL@ +REPLACE_ITOLD = @REPLACE_ITOLD@ +REPLACE_LCHOWN = @REPLACE_LCHOWN@ +REPLACE_LINK = @REPLACE_LINK@ +REPLACE_LINKAT = @REPLACE_LINKAT@ +REPLACE_LOCALECONV = @REPLACE_LOCALECONV@ +REPLACE_LOCALTIME = @REPLACE_LOCALTIME@ +REPLACE_LOCALTIME_R = @REPLACE_LOCALTIME_R@ +REPLACE_LSEEK = @REPLACE_LSEEK@ +REPLACE_LSTAT = @REPLACE_LSTAT@ +REPLACE_MALLOC = @REPLACE_MALLOC@ +REPLACE_MBRLEN = @REPLACE_MBRLEN@ +REPLACE_MBRTOWC = @REPLACE_MBRTOWC@ +REPLACE_MBSINIT = @REPLACE_MBSINIT@ +REPLACE_MBSNRTOWCS = @REPLACE_MBSNRTOWCS@ +REPLACE_MBSRTOWCS = @REPLACE_MBSRTOWCS@ +REPLACE_MBSTATE_T = @REPLACE_MBSTATE_T@ +REPLACE_MBTOWC = 
@REPLACE_MBTOWC@ +REPLACE_MEMCHR = @REPLACE_MEMCHR@ +REPLACE_MEMMEM = @REPLACE_MEMMEM@ +REPLACE_MKDIR = @REPLACE_MKDIR@ +REPLACE_MKFIFO = @REPLACE_MKFIFO@ +REPLACE_MKNOD = @REPLACE_MKNOD@ +REPLACE_MKSTEMP = @REPLACE_MKSTEMP@ +REPLACE_MKTIME = @REPLACE_MKTIME@ +REPLACE_NANOSLEEP = @REPLACE_NANOSLEEP@ +REPLACE_NL_LANGINFO = @REPLACE_NL_LANGINFO@ +REPLACE_NULL = @REPLACE_NULL@ +REPLACE_OBSTACK_PRINTF = @REPLACE_OBSTACK_PRINTF@ +REPLACE_OPEN = @REPLACE_OPEN@ +REPLACE_OPENAT = @REPLACE_OPENAT@ +REPLACE_OPENDIR = @REPLACE_OPENDIR@ +REPLACE_PERROR = @REPLACE_PERROR@ +REPLACE_POPEN = @REPLACE_POPEN@ +REPLACE_PREAD = @REPLACE_PREAD@ +REPLACE_PRINTF = @REPLACE_PRINTF@ +REPLACE_PTSNAME = @REPLACE_PTSNAME@ +REPLACE_PTSNAME_R = @REPLACE_PTSNAME_R@ +REPLACE_PUTENV = @REPLACE_PUTENV@ +REPLACE_PWRITE = @REPLACE_PWRITE@ +REPLACE_QSORT_R = @REPLACE_QSORT_R@ +REPLACE_RANDOM_R = @REPLACE_RANDOM_R@ +REPLACE_READ = @REPLACE_READ@ +REPLACE_READLINK = @REPLACE_READLINK@ +REPLACE_READLINKAT = @REPLACE_READLINKAT@ +REPLACE_REALLOC = @REPLACE_REALLOC@ +REPLACE_REALPATH = @REPLACE_REALPATH@ +REPLACE_REMOVE = @REPLACE_REMOVE@ +REPLACE_RENAME = @REPLACE_RENAME@ +REPLACE_RENAMEAT = @REPLACE_RENAMEAT@ +REPLACE_RMDIR = @REPLACE_RMDIR@ +REPLACE_SETENV = @REPLACE_SETENV@ +REPLACE_SETLOCALE = @REPLACE_SETLOCALE@ +REPLACE_SLEEP = @REPLACE_SLEEP@ +REPLACE_SNPRINTF = @REPLACE_SNPRINTF@ +REPLACE_SPRINTF = @REPLACE_SPRINTF@ +REPLACE_STAT = @REPLACE_STAT@ +REPLACE_STDIO_READ_FUNCS = @REPLACE_STDIO_READ_FUNCS@ +REPLACE_STDIO_WRITE_FUNCS = @REPLACE_STDIO_WRITE_FUNCS@ +REPLACE_STPNCPY = @REPLACE_STPNCPY@ +REPLACE_STRCASESTR = @REPLACE_STRCASESTR@ +REPLACE_STRCHRNUL = @REPLACE_STRCHRNUL@ +REPLACE_STRDUP = @REPLACE_STRDUP@ +REPLACE_STRERROR = @REPLACE_STRERROR@ +REPLACE_STRERROR_R = @REPLACE_STRERROR_R@ +REPLACE_STRNCAT = @REPLACE_STRNCAT@ +REPLACE_STRNDUP = @REPLACE_STRNDUP@ +REPLACE_STRNLEN = @REPLACE_STRNLEN@ +REPLACE_STRSIGNAL = @REPLACE_STRSIGNAL@ +REPLACE_STRSTR = @REPLACE_STRSTR@ +REPLACE_STRTOD = 
@REPLACE_STRTOD@ +REPLACE_STRTOIMAX = @REPLACE_STRTOIMAX@ +REPLACE_STRTOK_R = @REPLACE_STRTOK_R@ +REPLACE_STRTOUMAX = @REPLACE_STRTOUMAX@ +REPLACE_STRUCT_LCONV = @REPLACE_STRUCT_LCONV@ +REPLACE_STRUCT_TIMEVAL = @REPLACE_STRUCT_TIMEVAL@ +REPLACE_SYMLINK = @REPLACE_SYMLINK@ +REPLACE_SYMLINKAT = @REPLACE_SYMLINKAT@ +REPLACE_TIMEGM = @REPLACE_TIMEGM@ +REPLACE_TMPFILE = @REPLACE_TMPFILE@ +REPLACE_TOWLOWER = @REPLACE_TOWLOWER@ +REPLACE_TTYNAME_R = @REPLACE_TTYNAME_R@ +REPLACE_UNLINK = @REPLACE_UNLINK@ +REPLACE_UNLINKAT = @REPLACE_UNLINKAT@ +REPLACE_UNSETENV = @REPLACE_UNSETENV@ +REPLACE_USLEEP = @REPLACE_USLEEP@ +REPLACE_UTIMENSAT = @REPLACE_UTIMENSAT@ +REPLACE_VASPRINTF = @REPLACE_VASPRINTF@ +REPLACE_VDPRINTF = @REPLACE_VDPRINTF@ +REPLACE_VFPRINTF = @REPLACE_VFPRINTF@ +REPLACE_VPRINTF = @REPLACE_VPRINTF@ +REPLACE_VSNPRINTF = @REPLACE_VSNPRINTF@ +REPLACE_VSPRINTF = @REPLACE_VSPRINTF@ +REPLACE_WCRTOMB = @REPLACE_WCRTOMB@ +REPLACE_WCSNRTOMBS = @REPLACE_WCSNRTOMBS@ +REPLACE_WCSRTOMBS = @REPLACE_WCSRTOMBS@ +REPLACE_WCSWIDTH = @REPLACE_WCSWIDTH@ +REPLACE_WCTOB = @REPLACE_WCTOB@ +REPLACE_WCTOMB = @REPLACE_WCTOMB@ +REPLACE_WCWIDTH = @REPLACE_WCWIDTH@ +REPLACE_WRITE = @REPLACE_WRITE@ +SED = @SED@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +SIG_ATOMIC_T_SUFFIX = @SIG_ATOMIC_T_SUFFIX@ +SIZE_T_SUFFIX = @SIZE_T_SUFFIX@ +STDALIGN_H = @STDALIGN_H@ +STDARG_H = @STDARG_H@ +STDBOOL_H = @STDBOOL_H@ +STDDEF_H = @STDDEF_H@ +STDINT_H = @STDINT_H@ +STRIP = @STRIP@ +SYS_TIME_H_DEFINES_STRUCT_TIMESPEC = @SYS_TIME_H_DEFINES_STRUCT_TIMESPEC@ +TIME_H_DEFINES_STRUCT_TIMESPEC = @TIME_H_DEFINES_STRUCT_TIMESPEC@ +UINT32_MAX_LT_UINTMAX_MAX = @UINT32_MAX_LT_UINTMAX_MAX@ +UINT64_MAX_EQ_ULONG_MAX = @UINT64_MAX_EQ_ULONG_MAX@ +UNDEFINE_STRTOK_R = @UNDEFINE_STRTOK_R@ +UNISTD_H_DEFINES_STRUCT_TIMESPEC = @UNISTD_H_DEFINES_STRUCT_TIMESPEC@ +UNISTD_H_HAVE_WINSOCK2_H = @UNISTD_H_HAVE_WINSOCK2_H@ +UNISTD_H_HAVE_WINSOCK2_H_AND_USE_SOCKETS = @UNISTD_H_HAVE_WINSOCK2_H_AND_USE_SOCKETS@ +USE_NLS = @USE_NLS@ +VERSION = 
@VERSION@ +WARN_CFLAGS = @WARN_CFLAGS@ +WCHAR_T_SUFFIX = @WCHAR_T_SUFFIX@ +WERROR_CFLAGS = @WERROR_CFLAGS@ +WINDOWS_64_BIT_OFF_T = @WINDOWS_64_BIT_OFF_T@ +WINDOWS_64_BIT_ST_SIZE = @WINDOWS_64_BIT_ST_SIZE@ +WINT_T_SUFFIX = @WINT_T_SUFFIX@ +XGETTEXT = @XGETTEXT@ +XGETTEXT_015 = @XGETTEXT_015@ +XGETTEXT_EXTRA_OPTIONS = @XGETTEXT_EXTRA_OPTIONS@ +abs_aux_dir = @abs_aux_dir@ +abs_builddir = @abs_builddir@ +abs_srcdir = @abs_srcdir@ +abs_top_builddir = @abs_top_builddir@ +abs_top_srcdir = @abs_top_srcdir@ +ac_ct_AR = @ac_ct_AR@ +ac_ct_CC = @ac_ct_CC@ +am__include = @am__include@ +am__quote = @am__quote@ +am__tar = @am__tar@ +am__untar = @am__untar@ +bindir = @bindir@ +build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ +builddir = @builddir@ +datadir = @datadir@ +datarootdir = @datarootdir@ +docdir = @docdir@ +dvidir = @dvidir@ +exec_prefix = @exec_prefix@ +gl_LIBOBJS = @gl_LIBOBJS@ +gl_LTLIBOBJS = @gl_LTLIBOBJS@ +gltests_LIBOBJS = @gltests_LIBOBJS@ +gltests_LTLIBOBJS = @gltests_LTLIBOBJS@ +gltests_WITNESS = @gltests_WITNESS@ +host = @host@ +host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +htmldir = @htmldir@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +lispdir = @lispdir@ +localedir = $(datadir)/locale +localstatedir = @localstatedir@ +mandir = @mandir@ +mkdir_p = @mkdir_p@ +oldincludedir = @oldincludedir@ +pdfdir = @pdfdir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +psdir = @psdir@ +runstatedir = @runstatedir@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +srcdir = @srcdir@ +sysconfdir = @sysconfdir@ +target_alias = @target_alias@ +top_build_prefix = @top_build_prefix@ +top_builddir = @top_builddir@ +top_srcdir = @top_srcdir@ +LN = ln +AM_CFLAGS = $(WARN_CFLAGS) $(WERROR_CFLAGS) $(PCRE_CFLAGS) + +# Tell the linker to omit references to 
unused shared libraries. +AM_LDFLAGS = $(IGNORE_UNUSED_LIBRARIES_CFLAGS) +bin_SCRIPTS = egrep fgrep +grep_SOURCES = grep.c searchutils.c \ + dfa.c dfasearch.c \ + kwset.c kwsearch.c \ + pcresearch.c + +noinst_HEADERS = grep.h dfa.h kwset.h search.h system.h + +# Sometimes, the expansion of $(LIBINTL) includes -lc which may +# include modules defining variables like 'optind', so libgreputils.a +# must precede $(LIBINTL) in order to ensure we use GNU getopt. +# But libgreputils.a must also follow $(LIBINTL), since libintl uses +# replacement functions defined in libgreputils.a. +LDADD = \ + ../lib/libgreputils.a $(LIBINTL) ../lib/libgreputils.a $(LIBICONV) \ + $(LIBTHREAD) + +grep_LDADD = $(LDADD) $(PCRE_LIBS) +AM_CPPFLAGS = -I$(top_builddir)/lib -I$(top_srcdir)/lib +EXTRA_DIST = dosbuf.c egrep.sh +CLEANFILES = egrep fgrep *-t +all: all-am + +.SUFFIXES: +.SUFFIXES: .c .o .obj +$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ + && { if test -f $@; then exit 0; else break; fi; }; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu src/Makefile'; \ + $(am__cd) $(top_srcdir) && \ + $(AUTOMAKE) --gnu src/Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' 
in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \ + esac; + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(am__aclocal_m4_deps): +install-binPROGRAMS: $(bin_PROGRAMS) + @$(NORMAL_INSTALL) + @list='$(bin_PROGRAMS)'; test -n "$(bindir)" || list=; \ + if test -n "$$list"; then \ + echo " $(MKDIR_P) '$(DESTDIR)$(bindir)'"; \ + $(MKDIR_P) "$(DESTDIR)$(bindir)" || exit 1; \ + fi; \ + for p in $$list; do echo "$$p $$p"; done | \ + sed 's/$(EXEEXT)$$//' | \ + while read p p1; do if test -f $$p \ + ; then echo "$$p"; echo "$$p"; else :; fi; \ + done | \ + sed -e 'p;s,.*/,,;n;h' \ + -e 's|.*|.|' \ + -e 'p;x;s,.*/,,;s/$(EXEEXT)$$//;$(transform);s/$$/$(EXEEXT)/' | \ + sed 'N;N;N;s,\n, ,g' | \ + $(AWK) 'BEGIN { files["."] = ""; dirs["."] = 1 } \ + { d=$$3; if (dirs[d] != 1) { print "d", d; dirs[d] = 1 } \ + if ($$2 == $$4) files[d] = files[d] " " $$1; \ + else { print "f", $$3 "/" $$4, $$1; } } \ + END { for (d in files) print "f", d, files[d] }' | \ + while read type dir files; do \ + if test "$$dir" = .; then dir=; else dir=/$$dir; fi; \ + test -z "$$files" || { \ + echo " $(INSTALL_PROGRAM_ENV) $(INSTALL_PROGRAM) $$files '$(DESTDIR)$(bindir)$$dir'"; \ + $(INSTALL_PROGRAM_ENV) $(INSTALL_PROGRAM) $$files "$(DESTDIR)$(bindir)$$dir" || exit $$?; \ + } \ + ; done + +uninstall-binPROGRAMS: + @$(NORMAL_UNINSTALL) + @list='$(bin_PROGRAMS)'; test -n "$(bindir)" || list=; \ + files=`for p in $$list; do echo "$$p"; done | \ + sed -e 
'h;s,^.*/,,;s/$(EXEEXT)$$//;$(transform)' \ + -e 's/$$/$(EXEEXT)/' \ + `; \ + test -n "$$list" || exit 0; \ + echo " ( cd '$(DESTDIR)$(bindir)' && rm -f" $$files ")"; \ + cd "$(DESTDIR)$(bindir)" && rm -f $$files + +clean-binPROGRAMS: + -test -z "$(bin_PROGRAMS)" || rm -f $(bin_PROGRAMS) + +grep$(EXEEXT): $(grep_OBJECTS) $(grep_DEPENDENCIES) $(EXTRA_grep_DEPENDENCIES) + @rm -f grep$(EXEEXT) + $(AM_V_CCLD)$(LINK) $(grep_OBJECTS) $(grep_LDADD) $(LIBS) +install-binSCRIPTS: $(bin_SCRIPTS) + @$(NORMAL_INSTALL) + @list='$(bin_SCRIPTS)'; test -n "$(bindir)" || list=; \ + if test -n "$$list"; then \ + echo " $(MKDIR_P) '$(DESTDIR)$(bindir)'"; \ + $(MKDIR_P) "$(DESTDIR)$(bindir)" || exit 1; \ + fi; \ + for p in $$list; do \ + if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ + if test -f "$$d$$p"; then echo "$$d$$p"; echo "$$p"; else :; fi; \ + done | \ + sed -e 'p;s,.*/,,;n' \ + -e 'h;s|.*|.|' \ + -e 'p;x;s,.*/,,;$(transform)' | sed 'N;N;N;s,\n, ,g' | \ + $(AWK) 'BEGIN { files["."] = ""; dirs["."] = 1; } \ + { d=$$3; if (dirs[d] != 1) { print "d", d; dirs[d] = 1 } \ + if ($$2 == $$4) { files[d] = files[d] " " $$1; \ + if (++n[d] == $(am__install_max)) { \ + print "f", d, files[d]; n[d] = 0; files[d] = "" } } \ + else { print "f", d "/" $$4, $$1 } } \ + END { for (d in files) print "f", d, files[d] }' | \ + while read type dir files; do \ + if test "$$dir" = .; then dir=; else dir=/$$dir; fi; \ + test -z "$$files" || { \ + echo " $(INSTALL_SCRIPT) $$files '$(DESTDIR)$(bindir)$$dir'"; \ + $(INSTALL_SCRIPT) $$files "$(DESTDIR)$(bindir)$$dir" || exit $$?; \ + } \ + ; done + +uninstall-binSCRIPTS: + @$(NORMAL_UNINSTALL) + @list='$(bin_SCRIPTS)'; test -n "$(bindir)" || exit 0; \ + files=`for p in $$list; do echo "$$p"; done | \ + sed -e 's,.*/,,;$(transform)'`; \ + dir='$(DESTDIR)$(bindir)'; $(am__uninstall_files_from_dir) + +mostlyclean-compile: + -rm -f *.$(OBJEXT) + +distclean-compile: + -rm -f *.tab.c + +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dfa.Po@am__quote@ 
# am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dfasearch.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/grep.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/kwsearch.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/kwset.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pcresearch.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/searchutils.Po@am__quote@ # am--include-marker + +$(am__depfiles_remade): + @$(MKDIR_P) $(@D) + @echo '# dummy' >$@-t && $(am__mv) $@-t $@ + +am--depfiles: $(am__depfiles_remade) + +.c.o: +@am__fastdepCC_TRUE@ $(AM_V_CC)$(am__set_depbase) && \ +@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ $< && \ +@am__fastdepCC_TRUE@ $(am__mv) $$depbase.Tpo $$depbase.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $< + +.c.obj: +@am__fastdepCC_TRUE@ $(AM_V_CC)$(am__set_depbase) && \ +@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ $$($(CYGPATH_W) $<) && \ +@am__fastdepCC_TRUE@ $(am__mv) $$depbase.Tpo $$depbase.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $$($(CYGPATH_W) $<) + +ID: $(am__tagged_files) + $(am__define_uniq_tagged_files); mkid -fID $$unique +tags: tags-am +TAGS: tags + +tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + set x; \ + here=`pwd`; \ + $(am__define_uniq_tagged_files); \ + shift; \ + if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ + test -n "$$unique" || 
unique=$$empty_fix; \ + if test $$# -gt 0; then \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + "$$@" $$unique; \ + else \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$unique; \ + fi; \ + fi +ctags: ctags-am + +CTAGS: ctags +ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + $(am__define_uniq_tagged_files); \ + test -z "$(CTAGS_ARGS)$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && $(am__cd) $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) "$$here" +cscopelist: cscopelist-am + +cscopelist-am: $(am__tagged_files) + list='$(am__tagged_files)'; \ + case "$(srcdir)" in \ + [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \ + *) sdir=$(subdir)/$(srcdir) ;; \ + esac; \ + for i in $$list; do \ + if test -f "$$i"; then \ + echo "$(subdir)/$$i"; \ + else \ + echo "$$sdir/$$i"; \ + fi; \ + done >> $(top_builddir)/cscope.files + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags + +distdir: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ + */*) $(MKDIR_P) `echo "$$dist_files" | \ + sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + if test -d $$d/$$file; then \ + dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test -d "$(distdir)/$$file"; then \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ + find "$(distdir)/$$file" -type d ! 
-perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ + else \ + test -f "$(distdir)/$$file" \ + || cp -p $$d/$$file "$(distdir)/$$file" \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile $(PROGRAMS) $(SCRIPTS) $(HEADERS) +installdirs: + for dir in "$(DESTDIR)$(bindir)" "$(DESTDIR)$(bindir)"; do \ + test -z "$$dir" || $(MKDIR_P) "$$dir"; \ + done +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + if test -z '$(STRIP)'; then \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + install; \ + else \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ + fi +mostlyclean-generic: + +clean-generic: + -test -z "$(CLEANFILES)" || rm -f $(CLEANFILES) + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." 
+clean: clean-am + +clean-am: clean-binPROGRAMS clean-generic mostlyclean-am + +distclean: distclean-am + -rm -f ./$(DEPDIR)/dfa.Po + -rm -f ./$(DEPDIR)/dfasearch.Po + -rm -f ./$(DEPDIR)/grep.Po + -rm -f ./$(DEPDIR)/kwsearch.Po + -rm -f ./$(DEPDIR)/kwset.Po + -rm -f ./$(DEPDIR)/pcresearch.Po + -rm -f ./$(DEPDIR)/searchutils.Po + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-tags + +dvi: dvi-am + +dvi-am: + +html: html-am + +html-am: + +info: info-am + +info-am: + +install-data-am: + +install-dvi: install-dvi-am + +install-dvi-am: + +install-exec-am: install-binPROGRAMS install-binSCRIPTS + +install-html: install-html-am + +install-html-am: + +install-info: install-info-am + +install-info-am: + +install-man: + +install-pdf: install-pdf-am + +install-pdf-am: + +install-ps: install-ps-am + +install-ps-am: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -f ./$(DEPDIR)/dfa.Po + -rm -f ./$(DEPDIR)/dfasearch.Po + -rm -f ./$(DEPDIR)/grep.Po + -rm -f ./$(DEPDIR)/kwsearch.Po + -rm -f ./$(DEPDIR)/kwset.Po + -rm -f ./$(DEPDIR)/pcresearch.Po + -rm -f ./$(DEPDIR)/searchutils.Po + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: uninstall-binPROGRAMS uninstall-binSCRIPTS + +.MAKE: install-am install-strip + +.PHONY: CTAGS GTAGS TAGS all all-am am--depfiles check check-am clean \ + clean-binPROGRAMS clean-generic cscopelist-am ctags ctags-am \ + distclean distclean-compile distclean-generic distclean-tags \ + distdir dvi dvi-am html html-am info info-am install \ + install-am install-binPROGRAMS install-binSCRIPTS install-data \ + install-data-am install-dvi install-dvi-am install-exec \ + install-exec-am install-html install-html-am install-info \ + install-info-am install-man install-pdf install-pdf-am \ + install-ps install-ps-am 
install-strip installcheck \ + installcheck-am installdirs maintainer-clean \ + maintainer-clean-generic mostlyclean mostlyclean-compile \ + mostlyclean-generic pdf pdf-am ps ps-am tags tags-am uninstall \ + uninstall-am uninstall-binPROGRAMS uninstall-binSCRIPTS + +.PRECIOUS: Makefile + + +egrep fgrep: egrep.sh Makefile + $(AM_V_GEN)grep=`echo grep | sed -e '$(transform)'` && \ + case $@ in egrep) option=-E;; fgrep) option=-F;; esac && \ + shell_does_substrings='set x/y && d=$${1%/*} && test "$$d" = x' && \ + if $(SHELL) -c "$$shell_does_substrings" 2>/dev/null; then \ + edit_substring='s,X,X,'; \ + else \ + edit_substring='s,\$${0%/\*},`expr "X$$0" : '\''X\\(.*\\)/'\''`,g'; \ + fi && \ + sed -e 's|[@]SHELL@|$(SHELL)|g' \ + -e "$$edit_substring" \ + -e "s|[@]grep@|$$grep|g" \ + -e "s|[@]option@|$$option|g" <$(srcdir)/egrep.sh >$@-t + $(AM_V_at)chmod +x $@-t + $(AM_V_at)mv $@-t $@ + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/src/dfa.c b/src/dfa.c new file mode 100644 index 0000000..98ee4ac --- /dev/null +++ b/src/dfa.c @@ -0,0 +1,4168 @@ +/* dfa.c - deterministic extended regexp routines for GNU + Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2016 Free Software + Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., + 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */ + +/* Written June, 1988 by Mike Haertel + Modified July, 1988 by Arthur David Olson to assist BMG speedups */ + +/* NOTE(review): every bare #include directive below has no header name; + the angle-bracketed operands appear to have been lost when this patch + was extracted and must be restored from the pristine source before + this file can be compiled. */ +#include + +#include "dfa.h" + +#include +#include +#include +#include +#include +#include +#include + +/* Nonzero if the strings A and B compare equal under strcmp. */ +#define STREQ(a, b) (strcmp (a, b) == 0) + +/* ISASCIIDIGIT differs from isdigit, as follows: + - Its arg may be any int or unsigned int; it need not be an unsigned char. + - It's guaranteed to evaluate its argument exactly once. + - It's typically faster. + Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that + only '0' through '9' are digits. Prefer ISASCIIDIGIT to isdigit unless + it's important to use the locale's definition of "digit" even when the + host does not conform to Posix. */ +#define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9) + +#include "gettext.h" +/* Shorthand for passing the message STR to gettext. */ +#define _(str) gettext (str) + +#include +#include + +/* HPUX defines these as macros in sys/param.h. */ +#ifdef setbit +# undef setbit +#endif +#ifdef clrbit +# undef clrbit +#endif + +/* First integer value that is greater than any character code. */ +enum { NOTCHAR = 1 << CHAR_BIT }; + +/* This represents part of a character class. It must be unsigned and + at least CHARCLASS_WORD_BITS wide. Any excess bits are zero. */ +typedef unsigned int charclass_word; + +/* The number of bits used in a charclass word. utf8_classes assumes + this is exactly 32. */ +enum { CHARCLASS_WORD_BITS = 32 }; + +/* The maximum useful value of a charclass_word; all used bits are 1. + The shift is done in two steps so that no single shift count can equal + the width of charclass_word, which C leaves undefined. */ +#define CHARCLASS_WORD_MASK \ + (((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1) + +/* Number of words required to hold a bit for every character.
*/ +enum +{ + CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS +}; + +/* Sets of unsigned characters are stored as bit vectors in arrays of ints. */ +typedef charclass_word charclass[CHARCLASS_WORDS]; + +/* Convert a possibly-signed character to an unsigned character. This is + a bit safer than casting to unsigned char, since it catches some type + errors that the cast doesn't. */ +static unsigned char +to_uchar (char ch) +{ + return ch; +} + +/* Contexts tell us whether a character is a newline or a word constituent. + Word-constituent characters are those that satisfy iswalnum, plus '_'. + Each character has a single CTX_* value; bitmasks of CTX_* values denote + a particular character class. + + A state also stores a context value, which is a bitmask of CTX_* values. + A state's context represents a set of characters that the state's + predecessors must match. For example, a state whose context does not + include CTX_LETTER will never have transitions where the previous + character is a word constituent. A state whose context is CTX_ANY + might have transitions from any character. */ + +#define CTX_NONE 1 +#define CTX_LETTER 2 +#define CTX_NEWLINE 4 +#define CTX_ANY 7 + +/* Sometimes characters can only be matched depending on the surrounding + context. Such context decisions depend on what the previous character + was, and the value of the current (lookahead) character. Context + dependent constraints are encoded as 8 bit integers. Each bit that + is set indicates that the constraint succeeds in the corresponding + context. + + bit 8-11 - valid contexts when next character is CTX_NEWLINE + bit 4-7 - valid contexts when next character is CTX_LETTER + bit 0-3 - valid contexts when next character is CTX_NONE + + The macro SUCCEEDS_IN_CONTEXT determines whether a given constraint + succeeds in a particular context. 
Prev is a bitmask of possible + context values for the previous character, curr is the (single-bit) + context value for the lookahead character. */ +#define NEWLINE_CONSTRAINT(constraint) (((constraint) >> 8) & 0xf) +#define LETTER_CONSTRAINT(constraint) (((constraint) >> 4) & 0xf) +#define OTHER_CONSTRAINT(constraint) ((constraint) & 0xf) + +#define SUCCEEDS_IN_CONTEXT(constraint, prev, curr) \ + ((((curr) & CTX_NONE ? OTHER_CONSTRAINT (constraint) : 0) \ + | ((curr) & CTX_LETTER ? LETTER_CONSTRAINT (constraint) : 0) \ + | ((curr) & CTX_NEWLINE ? NEWLINE_CONSTRAINT (constraint) : 0)) & (prev)) + +/* The following macros describe what a constraint depends on. */ +#define PREV_NEWLINE_CONSTRAINT(constraint) (((constraint) >> 2) & 0x111) +#define PREV_LETTER_CONSTRAINT(constraint) (((constraint) >> 1) & 0x111) +#define PREV_OTHER_CONSTRAINT(constraint) ((constraint) & 0x111) + +#define PREV_NEWLINE_DEPENDENT(constraint) \ + (PREV_NEWLINE_CONSTRAINT (constraint) != PREV_OTHER_CONSTRAINT (constraint)) +#define PREV_LETTER_DEPENDENT(constraint) \ + (PREV_LETTER_CONSTRAINT (constraint) != PREV_OTHER_CONSTRAINT (constraint)) + +/* Tokens that match the empty string subject to some constraint actually + work by applying that constraint to determine what may follow them, + taking into account what has gone before. The following values are + the constraints corresponding to the special tokens previously defined. */ +#define NO_CONSTRAINT 0x777 +#define BEGLINE_CONSTRAINT 0x444 +#define ENDLINE_CONSTRAINT 0x700 +#define BEGWORD_CONSTRAINT 0x050 +#define ENDWORD_CONSTRAINT 0x202 +#define LIMWORD_CONSTRAINT 0x252 +#define NOTLIMWORD_CONSTRAINT 0x525 + +/* The regexp is parsed into an array of tokens in postfix form. Some tokens + are operators and others are terminal symbols. Most (but not all) of these + codes are returned by the lexical analyzer. */ + +typedef ptrdiff_t token; + +/* Predefined token values. 
*/ +enum +{ + END = -1, /* END is a terminal symbol that matches the + end of input; any value of END or less in + the parse tree is such a symbol. Accepting + states of the DFA are those that would have + a transition on END. */ + + /* Ordinary character values are terminal symbols that match themselves. */ + + EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches + the empty string. */ + + BACKREF, /* BACKREF is generated by \ + or by any other construct that + is not completely handled. If the scanner + detects a transition on backref, it returns + a kind of "semi-success" indicating that + the match will have to be verified with + a backtracking matcher. */ + + BEGLINE, /* BEGLINE is a terminal symbol that matches + the empty string at the beginning of a + line. */ + + ENDLINE, /* ENDLINE is a terminal symbol that matches + the empty string at the end of a line. */ + + BEGWORD, /* BEGWORD is a terminal symbol that matches + the empty string at the beginning of a + word. */ + + ENDWORD, /* ENDWORD is a terminal symbol that matches + the empty string at the end of a word. */ + + LIMWORD, /* LIMWORD is a terminal symbol that matches + the empty string at the beginning or the + end of a word. */ + + NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that + matches the empty string not at + the beginning or end of a word. */ + + QMARK, /* QMARK is an operator of one argument that + matches zero or one occurrences of its + argument. */ + + STAR, /* STAR is an operator of one argument that + matches the Kleene closure (zero or more + occurrences) of its argument. */ + + PLUS, /* PLUS is an operator of one argument that + matches the positive closure (one or more + occurrences) of its argument. */ + + REPMN, /* REPMN is a lexical token corresponding + to the {m,n} construct. REPMN never + appears in the compiled token vector. */ + + CAT, /* CAT is an operator of two arguments that + matches the concatenation of its + arguments. 
CAT is never returned by the + lexical analyzer. */ + + OR, /* OR is an operator of two arguments that + matches either of its arguments. */ + + LPAREN, /* LPAREN never appears in the parse tree, + it is only a lexeme. */ + + RPAREN, /* RPAREN never appears in the parse tree. */ + + ANYCHAR, /* ANYCHAR is a terminal symbol that matches + a valid multibyte (or single byte) character. + It is used only if MB_CUR_MAX > 1. */ + + MBCSET, /* MBCSET is similar to CSET, but for + multibyte characters. */ + + WCHAR, /* Only returned by lex. wctok contains + the wide character representation. */ + + CSET /* CSET and (and any value greater) is a + terminal symbol that matches any of a + class of characters. */ +}; + + +/* States of the recognizer correspond to sets of positions in the parse + tree, together with the constraints under which they may be matched. + So a position is encoded as an index into the parse tree together with + a constraint. */ +typedef struct +{ + size_t index; /* Index into the parse array. */ + unsigned int constraint; /* Constraint for matching this position. */ +} position; + +/* Sets of positions are stored as arrays. */ +typedef struct +{ + position *elems; /* Elements of this position set. */ + size_t nelem; /* Number of elements in this set. */ + size_t alloc; /* Number of elements allocated in ELEMS. */ +} position_set; + +/* Sets of leaves are also stored as arrays. */ +typedef struct +{ + size_t *elems; /* Elements of this position set. */ + size_t nelem; /* Number of elements in this set. */ +} leaf_set; + +/* A state of the dfa consists of a set of positions, some flags, + and the token value of the lowest-numbered position of the state that + contains an END token. */ +typedef struct +{ + size_t hash; /* Hash of the positions of this state. */ + position_set elems; /* Positions this state could match. */ + unsigned char context; /* Context from previous state. */ + unsigned short constraint; /* Constraint for this state to accept. 
*/ + token first_end; /* Token value of the first END in elems. */ + position_set mbps; /* Positions which can match multibyte + characters, e.g., period. + Used only if MB_CUR_MAX > 1. */ +} dfa_state; + +/* States are indexed by state_num values. These are normally + nonnegative but -1 is used as a special value. */ +typedef ptrdiff_t state_num; + +/* A bracket operator. + e.g., [a-c], [[:alpha:]], etc. */ +struct mb_char_classes +{ + ptrdiff_t cset; + bool invert; + wchar_t *chars; /* Normal characters. */ + size_t nchars; +}; + +/* A compiled regular expression. */ +struct dfa +{ + /* Fields filled by the scanner. */ + charclass *charclasses; /* Array of character sets for CSET tokens. */ + size_t cindex; /* Index for adding new charclasses. */ + size_t calloc; /* Number of charclasses allocated. */ + + /* Fields filled by the parser. */ + token *tokens; /* Postfix parse array. */ + size_t tindex; /* Index for adding new tokens. */ + size_t talloc; /* Number of tokens currently allocated. */ + size_t depth; /* Depth required of an evaluation stack + used for depth-first traversal of the + parse tree. */ + size_t nleaves; /* Number of leaves on the parse tree. */ + size_t nregexps; /* Count of parallel regexps being built + with dfaparse. */ + bool fast; /* The DFA is fast. */ + bool multibyte; /* MB_CUR_MAX > 1. */ + token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */ + mbstate_t mbs; /* Multibyte conversion state. */ + + /* dfaexec implementation. */ + char *(*dfaexec) (struct dfa *, char const *, char *, int, size_t *, int *); + + /* The following are valid only if MB_CUR_MAX > 1. */ + + /* The value of multibyte_prop[i] is defined by following rule. + if tokens[i] < NOTCHAR + bit 0 : tokens[i] is the first byte of a character, including + single-byte characters. + bit 1 : tokens[i] is the last byte of a character, including + single-byte characters. 
+ + if tokens[i] = MBCSET + ("the index of mbcsets corresponding to this operator" << 2) + 3 + + e.g. + tokens + = 'single_byte_a', 'multi_byte_A', single_byte_b' + = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b' + multibyte_prop + = 3 , 1 , 0 , 2 , 3 + */ + int *multibyte_prop; + + /* Array of the bracket expression in the DFA. */ + struct mb_char_classes *mbcsets; + size_t nmbcsets; + size_t mbcsets_alloc; + + /* Fields filled by the superset. */ + struct dfa *superset; /* Hint of the dfa. */ + + /* Fields filled by the state builder. */ + dfa_state *states; /* States of the dfa. */ + state_num sindex; /* Index for adding new states. */ + size_t salloc; /* Number of states currently allocated. */ + + /* Fields filled by the parse tree->NFA conversion. */ + position_set *follows; /* Array of follow sets, indexed by position + index. The follow of a position is the set + of positions containing characters that + could conceivably follow a character + matching the given position in a string + matching the regexp. Allocated to the + maximum possible position index. */ + bool searchflag; /* We are supposed to build a searching + as opposed to an exact matcher. A searching + matcher finds the first and shortest string + matching a regexp anywhere in the buffer, + whereas an exact matcher finds the longest + string matching, but anchored to the + beginning of the buffer. */ + + /* Fields filled by dfaexec. */ + state_num tralloc; /* Number of transition tables that have + slots so far, not counting trans[-1]. */ + int trcount; /* Number of transition tables that have + actually been built. */ + int min_trcount; /* Minimum of number of transition tables. + Always keep the number, even after freeing + the transition tables. It is also the + number of initial states. */ + state_num **trans; /* Transition tables for states that can + never accept. 
If the transitions for a + state have not yet been computed, or the + state could possibly accept, its entry in + this table is NULL. This points to one + past the start of the allocated array, + and trans[-1] is always NULL. */ + state_num **fails; /* Transition tables after failing to accept + on a state that potentially could do so. */ + int *success; /* Table of acceptance conditions used in + dfaexec and computed in build_state. */ + state_num *newlines; /* Transitions on newlines. The entry for a + newline in any transition table is always + -1 so we can count lines without wasting + too many cycles. The transition for a + newline is stored separately and handled + as a special case. Newline is also used + as a sentinel at the end of the buffer. */ + state_num initstate_letter; /* Initial state for letter context. */ + state_num initstate_others; /* Initial state for other contexts. */ + position_set mb_follows; /* Follow set added by ANYCHAR and/or MBCSET + on demand. */ + int *mb_match_lens; /* Array of length reduced by ANYCHAR and/or + MBCSET. Null if mb_follows.elems has not + been allocated. */ +}; + +/* Some macros for user access to dfa internals. */ + +/* S could possibly be an accepting state of R. */ +#define ACCEPTING(s, r) ((r).states[s].constraint) + +/* STATE accepts in the specified context. */ +#define ACCEPTS_IN_CONTEXT(prev, curr, state, dfa) \ + SUCCEEDS_IN_CONTEXT ((dfa).states[state].constraint, prev, curr) + +static void regexp (void); + +/* A table indexed by byte values that contains the corresponding wide + character (if any) for that byte. WEOF means the byte is not a + valid single-byte character. */ +static wint_t mbrtowc_cache[NOTCHAR]; + +/* Store into *PWC the result of converting the leading bytes of the + multibyte buffer S of length N bytes, using the mbrtowc_cache in *D + and updating the conversion state in *D. On conversion error, + convert just a single byte, to WEOF. Return the number of bytes + converted. 
+ + This differs from mbrtowc (PWC, S, N, &D->mbs) as follows: + + * PWC points to wint_t, not to wchar_t. + * The last arg is a dfa *D instead of merely a multibyte conversion + state D->mbs. D also contains an mbrtowc_cache for speed. + * N must be at least 1. + * S[N - 1] must be a sentinel byte. + * Shift encodings are not supported. + * The return value is always in the range 1..N. + * D->mbs is always valid afterwards. + * *PWC is always set to something. */ +static size_t +mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d) +{ + unsigned char uc = s[0]; + wint_t wc = mbrtowc_cache[uc]; + + if (wc == WEOF) + { + wchar_t wch; + size_t nbytes = mbrtowc (&wch, s, n, &d->mbs); + if (0 < nbytes && nbytes < (size_t) -2) + { + *pwc = wch; + return nbytes; + } + memset (&d->mbs, 0, sizeof d->mbs); + } + + *pwc = wc; + return 1; +} + +#ifdef DEBUG + +static void +prtok (token t) +{ + char const *s; + + if (t < 0) + fprintf (stderr, "END"); + else if (t < NOTCHAR) + { + unsigned int ch = t; + fprintf (stderr, "0x%02x", ch); + } + else + { + switch (t) + { + case EMPTY: + s = "EMPTY"; + break; + case BACKREF: + s = "BACKREF"; + break; + case BEGLINE: + s = "BEGLINE"; + break; + case ENDLINE: + s = "ENDLINE"; + break; + case BEGWORD: + s = "BEGWORD"; + break; + case ENDWORD: + s = "ENDWORD"; + break; + case LIMWORD: + s = "LIMWORD"; + break; + case NOTLIMWORD: + s = "NOTLIMWORD"; + break; + case QMARK: + s = "QMARK"; + break; + case STAR: + s = "STAR"; + break; + case PLUS: + s = "PLUS"; + break; + case CAT: + s = "CAT"; + break; + case OR: + s = "OR"; + break; + case LPAREN: + s = "LPAREN"; + break; + case RPAREN: + s = "RPAREN"; + break; + case ANYCHAR: + s = "ANYCHAR"; + break; + case MBCSET: + s = "MBCSET"; + break; + default: + s = "CSET"; + break; + } + fprintf (stderr, "%s", s); + } +} +#endif /* DEBUG */ + +/* Stuff pertaining to charclasses. 
*/ + +static bool +tstbit (unsigned int b, charclass const c) +{ + return c[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1; +} + +static void +setbit (unsigned int b, charclass c) +{ + c[b / CHARCLASS_WORD_BITS] |= (charclass_word) 1 << b % CHARCLASS_WORD_BITS; +} + +static void +clrbit (unsigned int b, charclass c) +{ + c[b / CHARCLASS_WORD_BITS] &= ~((charclass_word) 1 + << b % CHARCLASS_WORD_BITS); +} + +static void +copyset (charclass const src, charclass dst) +{ + memcpy (dst, src, sizeof (charclass)); +} + +static void +zeroset (charclass s) +{ + memset (s, 0, sizeof (charclass)); +} + +static void +notset (charclass s) +{ + int i; + + for (i = 0; i < CHARCLASS_WORDS; ++i) + s[i] = CHARCLASS_WORD_MASK & ~s[i]; +} + +static bool +equal (charclass const s1, charclass const s2) +{ + return memcmp (s1, s2, sizeof (charclass)) == 0; +} + +/* Ensure that the array addressed by PTR holds at least NITEMS + + (PTR || !NITEMS) items. Either return PTR, or reallocate the array + and return its new address. Although PTR may be null, the returned + value is never null. + + The array holds *NALLOC items; *NALLOC is updated on reallocation. + ITEMSIZE is the size of one item. Avoid O(N**2) behavior on arrays + growing linearly. */ +static void * +maybe_realloc (void *ptr, size_t nitems, size_t *nalloc, size_t itemsize) +{ + if (nitems < *nalloc) + return ptr; + *nalloc = nitems; + return x2nrealloc (ptr, nalloc, itemsize); +} + +/* In DFA D, find the index of charclass S, or allocate a new one. */ +static size_t +dfa_charclass_index (struct dfa *d, charclass const s) +{ + size_t i; + + for (i = 0; i < d->cindex; ++i) + if (equal (s, d->charclasses[i])) + return i; + d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc, + sizeof *d->charclasses); + ++d->cindex; + copyset (s, d->charclasses[i]); + return i; +} + +/* A pointer to the current dfa is kept here during parsing. 
*/ +static struct dfa *dfa; + +/* Find the index of charclass S in the current DFA, or allocate a new one. */ +static size_t +charclass_index (charclass const s) +{ + return dfa_charclass_index (dfa, s); +} + +/* Syntax bits controlling the behavior of the lexical analyzer. */ +static reg_syntax_t syntax_bits, syntax_bits_set; + +/* Flag for case-folding letters into sets. */ +static bool case_fold; + +/* End-of-line byte in data. */ +static unsigned char eolbyte; + +/* Cache of char-context values. */ +static int sbit[NOTCHAR]; + +/* Set of characters considered letters. */ +static charclass letters; + +/* Set of characters that are newline. */ +static charclass newline; + +static bool +unibyte_word_constituent (unsigned char c) +{ + return mbrtowc_cache[c] != WEOF && (isalnum (c) || (c) == '_'); +} + +static int +char_context (unsigned char c) +{ + if (c == eolbyte) + return CTX_NEWLINE; + if (unibyte_word_constituent (c)) + return CTX_LETTER; + return CTX_NONE; +} + +static int +wchar_context (wint_t wc) +{ + if (wc == (wchar_t) eolbyte || wc == 0) + return CTX_NEWLINE; + if (wc == L'_' || iswalnum (wc)) + return CTX_LETTER; + return CTX_NONE; +} + +/* Entry point to set syntax options. */ +void +dfasyntax (reg_syntax_t bits, int fold, unsigned char eol) +{ + int i; + syntax_bits_set = 1; + syntax_bits = bits; + case_fold = fold != 0; + eolbyte = eol; + + for (i = CHAR_MIN; i <= CHAR_MAX; ++i) + { + char c = i; + unsigned char uc = i; + mbstate_t s = { 0 }; + wchar_t wc; + mbrtowc_cache[uc] = mbrtowc (&wc, &c, 1, &s) <= 1 ? wc : WEOF; + + /* Now that mbrtowc_cache[uc] is set, use it to calculate sbit. */ + sbit[uc] = char_context (uc); + switch (sbit[uc]) + { + case CTX_LETTER: + setbit (uc, letters); + break; + case CTX_NEWLINE: + setbit (uc, newline); + break; + } + } +} + +/* Set a bit in the charclass for the given wchar_t. Do nothing if WC + is represented by a multi-byte sequence. 
Even for MB_CUR_MAX == 1, + this may happen when folding case in weird Turkish locales where + dotless i/dotted I are not included in the chosen character set. + Return whether a bit was set in the charclass. */ +static bool +setbit_wc (wint_t wc, charclass c) +{ + int b = wctob (wc); + if (b == EOF) + return false; + + setbit (b, c); + return true; +} + +/* Set a bit for B and its case variants in the charclass C. + MB_CUR_MAX must be 1. */ +static void +setbit_case_fold_c (int b, charclass c) +{ + int ub = toupper (b); + int i; + for (i = 0; i < NOTCHAR; i++) + if (toupper (i) == ub) + setbit (i, c); +} + + + +/* UTF-8 encoding allows some optimizations that we can't otherwise + assume in a multibyte encoding. */ +int +using_utf8 (void) +{ + static int utf8 = -1; + if (utf8 < 0) + { + wchar_t wc; + mbstate_t mbs = { 0 }; + utf8 = mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100; + } + return utf8; +} + +/* The current locale is known to be a unibyte locale + without multicharacter collating sequences and where range + comparisons simply use the native encoding. These locales can be + processed more efficiently. */ + +static bool +using_simple_locale (void) +{ + /* The native character set is known to be compatible with + the C locale. The following test isn't perfect, but it's good + enough in practice, as only ASCII and EBCDIC are in common use + and this test correctly accepts ASCII and rejects EBCDIC. */ + enum { native_c_charset = + ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12 + && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35 + && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41 + && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46 + && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59 + && '<' == 60 && '=' == 61 && '>' == 62 && '?' 
== 63 && 'A' == 65 + && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94 + && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124 + && '}' == 125 && '~' == 126) + }; + + if (! native_c_charset || dfa->multibyte) + return false; + else + { + static int unibyte_c = -1; + if (unibyte_c < 0) + { + char const *locale = setlocale (LC_ALL, NULL); + unibyte_c = (!locale + || STREQ (locale, "C") + || STREQ (locale, "POSIX")); + } + return unibyte_c; + } +} + +/* Lexical analyzer. All the dross that deals with the obnoxious + GNU Regex syntax bits is located here. The poor, suffering + reader is referred to the GNU Regex documentation for the + meaning of the @#%!@#%^!@ syntax bits. */ + +static char const *lexptr; /* Pointer to next input character. */ +static size_t lexleft; /* Number of characters remaining. */ +static token lasttok; /* Previous token returned; initially END. */ +static bool laststart; /* We're separated from beginning or (, + | only by zero-width characters. */ +static size_t parens; /* Count of outstanding left parens. */ +static int minrep, maxrep; /* Repeat counts for {m,n}. */ + +static int cur_mb_len = 1; /* Length of the multibyte representation of + wctok. */ + +static wint_t wctok; /* Wide character representation of the current + multibyte character, or WEOF if there was + an encoding error. Used only if + MB_CUR_MAX > 1. */ + + +/* Fetch the next lexical input character. Set C (of type int) to the + next input byte, except set C to EOF if the input is a multibyte + character of length greater than 1. Set WC (of type wint_t) to the + value of the input if it is a valid multibyte character (possibly + of length 1); otherwise set WC to WEOF. If there is no more input, + report EOFERR if EOFERR is not null, and return lasttok = END + otherwise. */ +# define FETCH_WC(c, wc, eoferr) \ + do { \ + if (! 
lexleft) \ + { \ + if ((eoferr) != 0) \ + dfaerror (eoferr); \ + else \ + return lasttok = END; \ + } \ + else \ + { \ + wint_t _wc; \ + size_t nbytes = mbs_to_wchar (&_wc, lexptr, lexleft, dfa); \ + cur_mb_len = nbytes; \ + (wc) = _wc; \ + (c) = nbytes == 1 ? to_uchar (*lexptr) : EOF; \ + lexptr += nbytes; \ + lexleft -= nbytes; \ + } \ + } while (0) + +#ifndef MIN +# define MIN(a,b) ((a) < (b) ? (a) : (b)) +#endif + +/* The set of wchar_t values C such that there's a useful locale + somewhere where C != towupper (C) && C != towlower (towupper (C)). + For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because + towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and + towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU). */ +static short const lonesome_lower[] = + { + 0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345, + 0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1, + + /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase + counterpart in locales predating Unicode 4.0.0 (April 2003). */ + 0x03F2, + + 0x03F5, 0x1E9B, 0x1FBE, + }; + +/* Maximum number of characters that can be the case-folded + counterparts of a single character, not counting the character + itself. This is 1 for towupper, 1 for towlower, and 1 for each + entry in LONESOME_LOWER. */ +enum +{ CASE_FOLDED_BUFSIZE = 2 + sizeof lonesome_lower / sizeof *lonesome_lower }; + +/* Find the characters equal to C after case-folding, other than C + itself, and store them into FOLDED. Return the number of characters + stored. 
*/ +static unsigned int +case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE]) +{ + unsigned int i; + unsigned int n = 0; + wint_t uc = towupper (c); + wint_t lc = towlower (uc); + if (uc != c) + folded[n++] = uc; + if (lc != uc && lc != c && towupper (lc) == uc) + folded[n++] = lc; + for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++) + { + wint_t li = lonesome_lower[i]; + if (li != lc && li != uc && li != c && towupper (li) == uc) + folded[n++] = li; + } + return n; +} + +typedef int predicate (int); + +/* The following list maps the names of the Posix named character classes + to predicate functions that determine whether a given character is in + the class. The leading [ has already been eaten by the lexical + analyzer. */ +struct dfa_ctype +{ + const char *name; + predicate *func; + bool single_byte_only; +}; + +static const struct dfa_ctype prednames[] = { + {"alpha", isalpha, false}, + {"upper", isupper, false}, + {"lower", islower, false}, + {"digit", isdigit, true}, + {"xdigit", isxdigit, false}, + {"space", isspace, false}, + {"punct", ispunct, false}, + {"alnum", isalnum, false}, + {"print", isprint, false}, + {"graph", isgraph, false}, + {"cntrl", iscntrl, false}, + {"blank", isblank, false}, + {NULL, NULL, false} +}; + +static const struct dfa_ctype *_GL_ATTRIBUTE_PURE +find_pred (const char *str) +{ + unsigned int i; + for (i = 0; prednames[i].name; ++i) + if (STREQ (str, prednames[i].name)) + return &prednames[i]; + return NULL; +} + +/* Multibyte character handling sub-routine for lex. + Parse a bracket expression and build a struct mb_char_classes. */ +static token +parse_bracket_exp (void) +{ + bool invert; + int c, c1, c2; + charclass ccl; + + /* This is a bracket expression that dfaexec is known to + process correctly. */ + bool known_bracket_exp = true; + + /* Used to warn about [:space:]. + Bit 0 = first character is a colon. + Bit 1 = last character is a colon. 
+ Bit 2 = includes any other character but a colon. + Bit 3 = includes ranges, char/equiv classes or collation elements. */ + int colon_warning_state; + + wint_t wc; + wint_t wc2; + wint_t wc1 = 0; + + /* Work area to build a mb_char_classes. */ + struct mb_char_classes *work_mbc; + size_t chars_al; + + chars_al = 0; + if (dfa->multibyte) + { + dfa->mbcsets = maybe_realloc (dfa->mbcsets, dfa->nmbcsets, + &dfa->mbcsets_alloc, + sizeof *dfa->mbcsets); + + /* dfa->multibyte_prop[] hold the index of dfa->mbcsets. + We will update dfa->multibyte_prop[] in addtok, because we can't + decide the index in dfa->tokens[]. */ + + /* Initialize work area. */ + work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]); + memset (work_mbc, 0, sizeof *work_mbc); + } + else + work_mbc = NULL; + + memset (ccl, 0, sizeof ccl); + FETCH_WC (c, wc, _("unbalanced [")); + if (c == '^') + { + FETCH_WC (c, wc, _("unbalanced [")); + invert = true; + known_bracket_exp = using_simple_locale (); + } + else + invert = false; + + colon_warning_state = (c == ':'); + do + { + c1 = NOTCHAR; /* Mark c1 as not initialized. */ + colon_warning_state &= ~2; + + /* Note that if we're looking at some other [:...:] construct, + we just treat it as a bunch of ordinary characters. We can do + this because we assume regex has checked for syntax errors before + dfa is ever called. */ + if (c == '[') + { + FETCH_WC (c1, wc1, _("unbalanced [")); + + if ((c1 == ':' && (syntax_bits & RE_CHAR_CLASSES)) + || c1 == '.' || c1 == '=') + { + enum { MAX_BRACKET_STRING_LEN = 32 }; + char str[MAX_BRACKET_STRING_LEN + 1]; + size_t len = 0; + for (;;) + { + FETCH_WC (c, wc, _("unbalanced [")); + if ((c == c1 && *lexptr == ']') || lexleft == 0) + break; + if (len < MAX_BRACKET_STRING_LEN) + str[len++] = c; + else + /* This is in any case an invalid class name. */ + str[0] = '\0'; + } + str[len] = '\0'; + + /* Fetch bracket. */ + FETCH_WC (c, wc, _("unbalanced [")); + if (c1 == ':') + /* Build character class. 
POSIX allows character + classes to match multicharacter collating elements, + but the regex code does not support that, so do not + worry about that possibility. */ + { + char const *class + = (case_fold && (STREQ (str, "upper") + || STREQ (str, "lower")) ? "alpha" : str); + const struct dfa_ctype *pred = find_pred (class); + if (!pred) + dfaerror (_("invalid character class")); + + if (dfa->multibyte && !pred->single_byte_only) + known_bracket_exp = false; + else + for (c2 = 0; c2 < NOTCHAR; ++c2) + if (pred->func (c2)) + setbit (c2, ccl); + } + else + known_bracket_exp = false; + + colon_warning_state |= 8; + + /* Fetch new lookahead character. */ + FETCH_WC (c1, wc1, _("unbalanced [")); + continue; + } + + /* We treat '[' as a normal character here. c/c1/wc/wc1 + are already set up. */ + } + + if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) + FETCH_WC (c, wc, _("unbalanced [")); + + if (c1 == NOTCHAR) + FETCH_WC (c1, wc1, _("unbalanced [")); + + if (c1 == '-') + /* build range characters. */ + { + FETCH_WC (c2, wc2, _("unbalanced [")); + + /* A bracket expression like [a-[.aa.]] matches an unknown set. + Treat it like [-a[.aa.]] while parsing it, and + remember that the set is unknown. */ + if (c2 == '[' && *lexptr == '.') + { + known_bracket_exp = false; + c2 = ']'; + } + + if (c2 == ']') + { + /* In the case [x-], the - is an ordinary hyphen, + which is left in c1, the lookahead character. */ + lexptr -= cur_mb_len; + lexleft += cur_mb_len; + } + else + { + if (c2 == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) + FETCH_WC (c2, wc2, _("unbalanced [")); + + colon_warning_state |= 8; + FETCH_WC (c1, wc1, _("unbalanced [")); + + /* Treat [x-y] as a range if x != y. 
*/ + if (wc != wc2 || wc == WEOF) + { + if (dfa->multibyte) + known_bracket_exp = false; + else if (using_simple_locale ()) + { + int ci; + for (ci = c; ci <= c2; ci++) + setbit (ci, ccl); + if (case_fold) + { + int uc = toupper (c); + int uc2 = toupper (c2); + for (ci = 0; ci < NOTCHAR; ci++) + { + int uci = toupper (ci); + if (uc <= uci && uci <= uc2) + setbit (ci, ccl); + } + } + } + else + known_bracket_exp = false; + + continue; + } + } + } + + colon_warning_state |= (c == ':') ? 2 : 4; + + if (!dfa->multibyte) + { + if (case_fold) + setbit_case_fold_c (c, ccl); + else + setbit (c, ccl); + continue; + } + + if (wc == WEOF) + known_bracket_exp = false; + else + { + wchar_t folded[CASE_FOLDED_BUFSIZE + 1]; + unsigned int i; + unsigned int n = (case_fold + ? case_folded_counterparts (wc, folded + 1) + 1 + : 1); + folded[0] = wc; + for (i = 0; i < n; i++) + if (!setbit_wc (folded[i], ccl)) + { + work_mbc->chars + = maybe_realloc (work_mbc->chars, work_mbc->nchars, + &chars_al, sizeof *work_mbc->chars); + work_mbc->chars[work_mbc->nchars++] = folded[i]; + } + } + } + while ((wc = wc1, (c = c1) != ']')); + + if (colon_warning_state == 7) + dfawarn (_("character class syntax is [[:space:]], not [:space:]")); + + if (! known_bracket_exp) + return BACKREF; + + if (dfa->multibyte) + { + static charclass zeroclass; + work_mbc->invert = invert; + work_mbc->cset = equal (ccl, zeroclass) ? 
-1 : charclass_index (ccl);
+      return MBCSET;
+    }
+
+  if (invert)
+    {
+      assert (!dfa->multibyte);
+      notset (ccl);
+      if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
+        clrbit ('\n', ccl);
+    }
+
+  return CSET + charclass_index (ccl);
+}
+
+#define PUSH_LEX_STATE(s) /* Save lexptr/lexleft and lex the string S.  */ \
+  do									\
+    {									\
+      char const *lexptr_saved = lexptr;				\
+      size_t lexleft_saved = lexleft;					\
+      lexptr = (s);							\
+      lexleft = strlen (lexptr)
+
+#define POP_LEX_STATE() /* Closes the block PUSH_LEX_STATE opened.  */	\
+      lexptr = lexptr_saved;						\
+      lexleft = lexleft_saved;						\
+    }									\
+  while (0)
+
+static token
+lex (void)
+{
+  int c, c2;
+  bool backslash = false;
+  charclass ccl;
+  int i;
+
+  /* Return the next token of the pattern, fetching input through
+     FETCH_WC and recording the returned token in lasttok.  Several
+     cases also update laststart, parens, minrep and maxrep.
+
+     Basic plan: We fetch a character.  If it's a backslash,
+     we set the backslash flag and go through the loop again.
+     On the plus side, this avoids having a duplicate of the
+     main switch inside the backslash case.  On the minus side,
+     it means that just about every case begins with
+     "if (backslash) ...".  */
+  for (i = 0; i < 2; ++i)
+    {
+      FETCH_WC (c, wctok, NULL);
+
+      switch (c)
+        {
+        case '\\':
+          if (backslash)
+            goto normal_char;
+          if (lexleft == 0)
+            dfaerror (_("unfinished \\ escape"));
+          backslash = true;
+          break;
+
+        case '^':
+          if (backslash)
+            goto normal_char;
+          if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
+              || lasttok == END || lasttok == LPAREN || lasttok == OR)
+            return lasttok = BEGLINE;
+          goto normal_char;
+
+        case '$':
+          if (backslash)
+            goto normal_char;
+          if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
+              || lexleft == 0
+              || (syntax_bits & RE_NO_BK_PARENS
+                  ? lexleft > 0 && *lexptr == ')'
+                  : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
+              || (syntax_bits & RE_NO_BK_VBAR
+                  ? lexleft > 0 && *lexptr == '|'
+                  : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|')
+              || ((syntax_bits & RE_NEWLINE_ALT)
+                  && lexleft > 0 && *lexptr == '\n'))
+            return lasttok = ENDLINE;
+          goto normal_char;
+
+        case '1':
+        case '2':
+        case '3':
+        case '4':
+        case '5':
+        case '6':
+        case '7':
+        case '8':
+        case '9':
+          if (backslash && !(syntax_bits & RE_NO_BK_REFS))
+            {
+              laststart = false;
+              return lasttok = BACKREF;
+            }
+          goto normal_char;
+
+        case '`':
+          if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
+            return lasttok = BEGLINE; /* FIXME: should be beginning of string */
+          goto normal_char;
+
+        case '\'':
+          if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
+            return lasttok = ENDLINE;   /* FIXME: should be end of string */
+          goto normal_char;
+
+        case '<':
+          if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
+            return lasttok = BEGWORD;
+          goto normal_char;
+
+        case '>':
+          if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
+            return lasttok = ENDWORD;
+          goto normal_char;
+
+        case 'b':
+          if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
+            return lasttok = LIMWORD;
+          goto normal_char;
+
+        case 'B':
+          if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
+            return lasttok = NOTLIMWORD;
+          goto normal_char;
+
+        case '?':
+          if (syntax_bits & RE_LIMITED_OPS)
+            goto normal_char;
+          /* Operator only if the backslash state is what the syntax wants.  */
+          if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
+            goto normal_char;
+          if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
+            goto normal_char;
+          return lasttok = QMARK;
+
+        case '*':
+          if (backslash)
+            goto normal_char;
+          if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
+            goto normal_char;
+          return lasttok = STAR;
+
+        case '+':
+          if (syntax_bits & RE_LIMITED_OPS)
+            goto normal_char;
+          if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
+            goto normal_char;
+          if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
+            goto normal_char;
+          return lasttok = PLUS;
+
+        case '{':
+          if (!(syntax_bits & RE_INTERVALS))
+            goto normal_char;
+          if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
+            goto normal_char;
+          if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
+            goto normal_char;
+
+          /* Cases:
+             {M} - exact count
+             {M,} - minimum count, maximum is infinity
+             {,N} - 0 through N
+             {,} - 0 to infinity (same as '*')
+             {M,N} - M through N
+
+             The interval is scanned with a private pointer P; lexptr
+             and lexleft are only advanced once it has been accepted.  */
+          {
+            char const *p = lexptr;
+            char const *lim = p + lexleft;
+            minrep = maxrep = -1;       /* Negative: no digits seen yet.  */
+            for (; p != lim && ISASCIIDIGIT (*p); p++)
+              {
+                if (minrep < 0)
+                  minrep = *p - '0';
+                else
+                  minrep = MIN (RE_DUP_MAX + 1, minrep * 10 + *p - '0');
+              }
+            if (p != lim)
+              {
+                if (*p != ',')
+                  maxrep = minrep;
+                else
+                  {
+                    if (minrep < 0)
+                      minrep = 0;
+                    while (++p != lim && ISASCIIDIGIT (*p))
+                      {
+                        if (maxrep < 0)
+                          maxrep = *p - '0';
+                        else
+                          maxrep = MIN (RE_DUP_MAX + 1, maxrep * 10 + *p - '0');
+                      }
+                  }
+              }
+            if (! ((! backslash || (p != lim && *p++ == '\\'))
+                   && p != lim && *p++ == '}'
+                   && 0 <= minrep && (maxrep < 0 || minrep <= maxrep)))
+              {
+                if (syntax_bits & RE_INVALID_INTERVAL_ORD)
+                  goto normal_char;
+                dfaerror (_("invalid content of \\{\\}"));
+              }
+            if (RE_DUP_MAX < maxrep)
+              dfaerror (_("regular expression too big"));
+            lexptr = p;
+            lexleft = lim - p;
+          }
+          laststart = false;
+          return lasttok = REPMN;
+
+        case '|':
+          if (syntax_bits & RE_LIMITED_OPS)
+            goto normal_char;
+          if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
+            goto normal_char;
+          laststart = true;
+          return lasttok = OR;
+
+        case '\n':
+          if (syntax_bits & RE_LIMITED_OPS
+              || backslash || !(syntax_bits & RE_NEWLINE_ALT))
+            goto normal_char;
+          laststart = true;
+          return lasttok = OR;
+
+        case '(':
+          if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
+            goto normal_char;
+          ++parens;
+          laststart = true;
+          return lasttok = LPAREN;
+
+        case ')':
+          if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
+            goto normal_char;
+          if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
+            goto normal_char;
+          --parens;
+          laststart = false;
+          return lasttok = RPAREN;
+
+        case '.':
+          if (backslash)
+            goto normal_char;
+          if (dfa->multibyte)
+            {
+              /* In multibyte environment period must match with a single
+                 character not a byte.  So we use ANYCHAR.  */
+              laststart = false;
+              return lasttok = ANYCHAR;
+            }
+          zeroset (ccl);
+          notset (ccl);
+          if (!(syntax_bits & RE_DOT_NEWLINE))
+            clrbit ('\n', ccl);
+          if (syntax_bits & RE_DOT_NOT_NULL)
+            clrbit ('\0', ccl);
+          laststart = false;
+          return lasttok = CSET + charclass_index (ccl);
+
+        case 's':
+        case 'S':
+          if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
+            goto normal_char;
+          if (!dfa->multibyte)
+            {
+              zeroset (ccl);
+              for (c2 = 0; c2 < NOTCHAR; ++c2)
+                if (isspace (c2))
+                  setbit (c2, ccl);
+              if (c == 'S')
+                notset (ccl);
+              laststart = false;
+              return lasttok = CSET + charclass_index (ccl);
+            }
+
+          /* FIXME: see if optimizing this, as is done with ANYCHAR and
+             add_utf8_anychar, makes sense.  */
+
+          /* \s and \S are documented to be equivalent to [[:space:]] and
+             [^[:space:]] respectively, so tell the lexer to process those
+             strings, each minus its "already processed" '['.  */
+          PUSH_LEX_STATE (c == 's' ? "[:space:]]" : "^[:space:]]");
+
+          lasttok = parse_bracket_exp ();
+
+          POP_LEX_STATE ();
+
+          laststart = false;
+          return lasttok;
+
+        case 'w':
+        case 'W':
+          if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
+            goto normal_char;
+
+          if (!dfa->multibyte)
+            {
+              zeroset (ccl);
+              for (c2 = 0; c2 < NOTCHAR; ++c2)
+                if (unibyte_word_constituent (c2))
+                  setbit (c2, ccl);
+              if (c == 'W')
+                notset (ccl);
+              laststart = false;
+              return lasttok = CSET + charclass_index (ccl);
+            }
+
+          /* FIXME: see if optimizing this, as is done with ANYCHAR and
+             add_utf8_anychar, makes sense.  */
+
+          /* \w and \W are documented to be equivalent to [_[:alnum:]] and
+             [^_[:alnum:]] respectively, so tell the lexer to process those
+             strings, each minus its "already processed" '['.  */
+          PUSH_LEX_STATE (c == 'w' ? "_[:alnum:]]" : "^_[:alnum:]]");
+
+          lasttok = parse_bracket_exp ();
+
+          POP_LEX_STATE ();
+
+          laststart = false;
+          return lasttok;
+
+        case '[':
+          if (backslash)
+            goto normal_char;
+          laststart = false;
+          return lasttok = parse_bracket_exp ();
+
+        default:
+        normal_char:
+          laststart = false;
+          /* For multibyte character sets, folding is done in atom.  Always
+             return WCHAR.  */
+          if (dfa->multibyte)
+            return lasttok = WCHAR;
+
+          if (case_fold && isalpha (c))
+            {
+              zeroset (ccl);
+              setbit_case_fold_c (c, ccl);
+              return lasttok = CSET + charclass_index (ccl);
+            }
+
+          return lasttok = c;
+        }
+    }
+
+  /* The above loop should consume at most a backslash
+     and some other character.  */
+  abort ();
+  return END;                   /* keeps pedantic compilers happy.  */
+}
+
+/* Recursive descent parser for regular expressions.  */
+
+static token tok;               /* Lookahead token.  */
+static size_t depth;            /* Current depth of a hypothetical stack
+                                   holding deferred productions.  This is
+                                   used to determine the depth that will be
+                                   required of the real stack later on in
+                                   dfaanalyze.  */
+
+static void /* Append token T to dfa->tokens (and MBPROP to
+               dfa->multibyte_prop when dfa->multibyte), then update the
+               depth, dfa->depth, dfa->nleaves and dfa->fast bookkeeping.  */
+addtok_mb (token t, int mbprop)
+{
+  if (dfa->talloc == dfa->tindex) /* Token array is full: grow it.  */
+    {
+      dfa->tokens = x2nrealloc (dfa->tokens, &dfa->talloc,
+                                sizeof *dfa->tokens);
+      if (dfa->multibyte)
+        dfa->multibyte_prop = xnrealloc (dfa->multibyte_prop, dfa->talloc,
+                                         sizeof *dfa->multibyte_prop);
+    }
+  if (dfa->multibyte)
+    dfa->multibyte_prop[dfa->tindex] = mbprop;
+  dfa->tokens[dfa->tindex++] = t;
+
+  switch (t) /* Unary operators keep the depth; binary ones reduce it.  */
+    {
+    case QMARK:
+    case STAR:
+    case PLUS:
+      break;
+
+    case CAT:
+    case OR:
+      --depth;
+      break;
+
+    case BACKREF:
+      dfa->fast = false;
+      /* fallthrough */
+    default:
+      ++dfa->nleaves;
+      /* fallthrough */
+    case EMPTY:
+      ++depth;
+      break;
+    }
+  if (depth > dfa->depth) /* Remember the high-water mark.  */
+    dfa->depth = depth;
+}
+
+static void addtok_wc (wint_t wc);
+
+/* Add the given token to the parse tree, maintaining the depth count and
+   updating the maximum depth if necessary.
*/
+static void
+addtok (token t)
+{
+  if (dfa->multibyte && t == MBCSET)
+    {
+      bool need_or = false;
+      struct mb_char_classes *work_mbc = &dfa->mbcsets[dfa->nmbcsets - 1];
+      size_t i;
+
+      /* Extract wide characters into alternations for better performance.
+         This does not require UTF-8.  Every character after the first is
+         followed by an OR joining it to what was emitted before it.  */
+      for (i = 0; i < work_mbc->nchars; i++)
+        {
+          addtok_wc (work_mbc->chars[i]);
+          if (need_or)
+            addtok (OR);
+          need_or = true;
+        }
+      work_mbc->nchars = 0;
+
+      /* Characters have been handled above, so it is possible
+         that the mbcset is empty now.  Do nothing in that case.  */
+      if (work_mbc->cset != -1)
+        {
+          addtok (CSET + work_mbc->cset);
+          if (need_or)
+            addtok (OR);
+        }
+    }
+  else
+    {
+      addtok_mb (t, 3);
+    }
+}
+
+/* We treat a multibyte character as a single atom, so that DFA
+   can treat a multibyte character as a single expression.
+
+   e.g., we construct the following tree from "<mb1><mb2>".
+   <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
+   <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
+static void
+addtok_wc (wint_t wc)
+{
+  unsigned char buf[MB_LEN_MAX];
+  mbstate_t s = { 0 };
+  int i;
+  size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
+
+  if (stored_bytes != (size_t) -1)
+    cur_mb_len = stored_bytes;
+  else
+    {
+      /* This is merely stop-gap.  buf[0] is undefined, yet skipping
+         the addtok_mb call altogether can corrupt the heap.  */
+      cur_mb_len = 1;
+      buf[0] = 0;
+    }
+
+  /* The second argument tags the byte: 3 for a one-byte character,
+     otherwise 1 for the first byte, 0 for a middle byte, 2 for the last.  */
+  addtok_mb (buf[0], cur_mb_len == 1 ? 3 : 1);
+  for (i = 1; i < cur_mb_len; i++)
+    {
+      addtok_mb (buf[i], i == cur_mb_len - 1 ? 2 : 0);
+      addtok (CAT);
+    }
+}
+
+static void
+add_utf8_anychar (void) /* Append tokens matching one UTF-8 character.  */
+{
+  static charclass const utf8_classes[5] = {
+    /* 80-bf: non-leading bytes.  */
+    {0, 0, 0, 0, CHARCLASS_WORD_MASK, CHARCLASS_WORD_MASK, 0, 0},
+
+    /* 00-7f: 1-byte sequence.  */
+    {CHARCLASS_WORD_MASK, CHARCLASS_WORD_MASK, CHARCLASS_WORD_MASK,
+     CHARCLASS_WORD_MASK, 0, 0, 0, 0},
+
+    /* c2-df: 2-byte sequence.  */
+    {0, 0, 0, 0, 0, 0, ~3 & CHARCLASS_WORD_MASK, 0},
+
+    /* e0-ef: 3-byte sequence.  */
+    {0, 0, 0, 0, 0, 0, 0, 0xffff},
+
+    /* f0-f7: 4-byte sequence.  */
+    {0, 0, 0, 0, 0, 0, 0, 0xff0000}
+  };
+  const unsigned int n = sizeof (utf8_classes) / sizeof (utf8_classes[0]);
+  unsigned int i;
+
+  /* Define the five character classes that are needed below.  This is
+     done only while dfa->utf8_anychar_classes[0] is still zero; class 1
+     (the 1-byte sequence) additionally honors RE_DOT_NEWLINE and
+     RE_DOT_NOT_NULL.  */
+  if (dfa->utf8_anychar_classes[0] == 0)
+    for (i = 0; i < n; i++)
+      {
+        charclass c;
+        copyset (utf8_classes[i], c);
+        if (i == 1)
+          {
+            if (!(syntax_bits & RE_DOT_NEWLINE))
+              clrbit ('\n', c);
+            if (syntax_bits & RE_DOT_NOT_NULL)
+              clrbit ('\0', c);
+          }
+        dfa->utf8_anychar_classes[i] = CSET + charclass_index (c);
+      }
+
+  /* A valid UTF-8 character is
+
+     ([0x00-0x7f]
+     |[0xc2-0xdf][0x80-0xbf]
+     |[0xe0-0xef][0x80-0xbf][0x80-0xbf]
+     |[0xf0-0xf7][0x80-0xbf][0x80-0xbf][0x80-0xbf])
+
+     which I'll write more concisely "B|CA|DAA|EAAA".  Factor the [0x80-0xbf]
+     and you get "B|(C|(D|EA)A)A".  And since the token buffer is in reverse
+     Polish notation, you get "B C D E A CAT OR A CAT OR A CAT OR".  */
+  for (i = 1; i < n; i++)
+    addtok (dfa->utf8_anychar_classes[i]);
+  while (--i > 1)
+    {
+      addtok (dfa->utf8_anychar_classes[0]);
+      addtok (CAT);
+      addtok (OR);
+    }
+}
+
+/* The grammar understood by the parser is as follows.
+
+   regexp:
+     regexp OR branch
+     branch
+
+   branch:
+     branch closure
+     closure
+
+   closure:
+     closure QMARK
+     closure STAR
+     closure PLUS
+     closure REPMN
+     atom
+
+   atom:
+     <normal character> (or WCHAR, a multibyte character)
+     ANYCHAR
+     MBCSET
+     CSET
+     BACKREF
+     BEGLINE
+     ENDLINE
+     BEGWORD
+     ENDWORD
+     LIMWORD
+     NOTLIMWORD
+     LPAREN regexp RPAREN
+     <empty>
+
+   The parser builds a parse tree in postfix form in an array of tokens.
*/ + +static void +atom (void) +{ + if (tok == WCHAR) + { + if (wctok == WEOF) + addtok (BACKREF); + else + { + addtok_wc (wctok); + + if (case_fold) + { + wchar_t folded[CASE_FOLDED_BUFSIZE]; + unsigned int i, n = case_folded_counterparts (wctok, folded); + for (i = 0; i < n; i++) + { + addtok_wc (folded[i]); + addtok (OR); + } + } + } + + tok = lex (); + } + else if (tok == ANYCHAR && using_utf8 ()) + { + /* For UTF-8 expand the period to a series of CSETs that define a valid + UTF-8 character. This avoids using the slow multibyte path. I'm + pretty sure it would be both profitable and correct to do it for + any encoding; however, the optimization must be done manually as + it is done above in add_utf8_anychar. So, let's start with + UTF-8: it is the most used, and the structure of the encoding + makes the correctness more obvious. */ + add_utf8_anychar (); + tok = lex (); + } + else if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF + || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD + || tok == ANYCHAR || tok == MBCSET + || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD) + { + addtok (tok); + tok = lex (); + } + else if (tok == LPAREN) + { + tok = lex (); + regexp (); + if (tok != RPAREN) + dfaerror (_("unbalanced (")); + tok = lex (); + } + else + addtok (EMPTY); +} + +/* Return the number of tokens in the given subexpression. */ +static size_t _GL_ATTRIBUTE_PURE +nsubtoks (size_t tindex) +{ + size_t ntoks1; + + switch (dfa->tokens[tindex - 1]) + { + default: + return 1; + case QMARK: + case STAR: + case PLUS: + return 1 + nsubtoks (tindex - 1); + case CAT: + case OR: + ntoks1 = nsubtoks (tindex - 1); + return 1 + ntoks1 + nsubtoks (tindex - 1 - ntoks1); + } +} + +/* Copy the given subexpression to the top of the tree. 
*/ +static void +copytoks (size_t tindex, size_t ntokens) +{ + size_t i; + + if (dfa->multibyte) + for (i = 0; i < ntokens; ++i) + addtok_mb (dfa->tokens[tindex + i], dfa->multibyte_prop[tindex + i]); + else + for (i = 0; i < ntokens; ++i) + addtok_mb (dfa->tokens[tindex + i], 3); +} + +static void +closure (void) +{ + int i; + size_t tindex, ntokens; + + atom (); + while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN) + if (tok == REPMN && (minrep || maxrep)) + { + ntokens = nsubtoks (dfa->tindex); + tindex = dfa->tindex - ntokens; + if (maxrep < 0) + addtok (PLUS); + if (minrep == 0) + addtok (QMARK); + for (i = 1; i < minrep; ++i) + { + copytoks (tindex, ntokens); + addtok (CAT); + } + for (; i < maxrep; ++i) + { + copytoks (tindex, ntokens); + addtok (QMARK); + addtok (CAT); + } + tok = lex (); + } + else if (tok == REPMN) + { + dfa->tindex -= nsubtoks (dfa->tindex); + tok = lex (); + closure (); + } + else + { + addtok (tok); + tok = lex (); + } +} + +static void +branch (void) +{ + closure (); + while (tok != RPAREN && tok != OR && tok >= 0) + { + closure (); + addtok (CAT); + } +} + +static void +regexp (void) +{ + branch (); + while (tok == OR) + { + tok = lex (); + branch (); + addtok (OR); + } +} + +/* Main entry point for the parser. S is a string to be parsed, len is the + length of the string, so s can include NUL characters. D is a pointer to + the struct dfa to parse into. 
*/ +void +dfaparse (char const *s, size_t len, struct dfa *d) +{ + dfa = d; + lexptr = s; + lexleft = len; + lasttok = END; + laststart = true; + parens = 0; + if (dfa->multibyte) + { + cur_mb_len = 0; + memset (&d->mbs, 0, sizeof d->mbs); + } + + if (!syntax_bits_set) + dfaerror (_("no syntax specified")); + + tok = lex (); + depth = d->depth; + + regexp (); + + if (tok != END) + dfaerror (_("unbalanced )")); + + addtok (END - d->nregexps); + addtok (CAT); + + if (d->nregexps) + addtok (OR); + + ++d->nregexps; +} + +/* Some primitives for operating on sets of positions. */ + +/* Copy one set to another. */ +static void +copy (position_set const *src, position_set * dst) +{ + if (dst->alloc < src->nelem) + { + free (dst->elems); + dst->alloc = src->nelem; + dst->elems = x2nrealloc (NULL, &dst->alloc, sizeof *dst->elems); + } + memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems); + dst->nelem = src->nelem; +} + +static void +alloc_position_set (position_set * s, size_t size) +{ + s->elems = xnmalloc (size, sizeof *s->elems); + s->alloc = size; + s->nelem = 0; +} + +/* Insert position P in set S. S is maintained in sorted order on + decreasing index. If there is already an entry in S with P.index + then merge (logically-OR) P's constraints into the one in S. + S->elems must point to an array large enough to hold the resulting set. */ +static void +insert (position p, position_set * s) +{ + size_t count = s->nelem; + size_t lo = 0, hi = count; + size_t i; + while (lo < hi) + { + size_t mid = (lo + hi) >> 1; + if (s->elems[mid].index > p.index) + lo = mid + 1; + else + hi = mid; + } + + if (lo < count && p.index == s->elems[lo].index) + { + s->elems[lo].constraint |= p.constraint; + return; + } + + s->elems = maybe_realloc (s->elems, count, &s->alloc, sizeof *s->elems); + for (i = count; i > lo; i--) + s->elems[i] = s->elems[i - 1]; + s->elems[lo] = p; + ++s->nelem; +} + +/* Merge two sets of positions into a third. 
The result is exactly as if + the positions of both sets were inserted into an initially empty set. */ +static void +merge (position_set const *s1, position_set const *s2, position_set * m) +{ + size_t i = 0, j = 0; + + if (m->alloc < s1->nelem + s2->nelem) + { + free (m->elems); + m->elems = maybe_realloc (NULL, s1->nelem + s2->nelem, &m->alloc, + sizeof *m->elems); + } + m->nelem = 0; + while (i < s1->nelem && j < s2->nelem) + if (s1->elems[i].index > s2->elems[j].index) + m->elems[m->nelem++] = s1->elems[i++]; + else if (s1->elems[i].index < s2->elems[j].index) + m->elems[m->nelem++] = s2->elems[j++]; + else + { + m->elems[m->nelem] = s1->elems[i++]; + m->elems[m->nelem++].constraint |= s2->elems[j++].constraint; + } + while (i < s1->nelem) + m->elems[m->nelem++] = s1->elems[i++]; + while (j < s2->nelem) + m->elems[m->nelem++] = s2->elems[j++]; +} + +/* Delete a position from a set. */ +static void +delete (position p, position_set * s) +{ + size_t i; + + for (i = 0; i < s->nelem; ++i) + if (p.index == s->elems[i].index) + break; + if (i < s->nelem) + for (--s->nelem; i < s->nelem; ++i) + s->elems[i] = s->elems[i + 1]; +} + +/* Find the index of the state corresponding to the given position set with + the given preceding context, or create a new state if there is no such + state. Context tells whether we got here on a newline or letter. */ +static state_num +state_index (struct dfa *d, position_set const *s, int context) +{ + size_t hash = 0; + int constraint; + state_num i, j; + + for (i = 0; i < s->nelem; ++i) + hash ^= s->elems[i].index + s->elems[i].constraint; + + /* Try to find a state that exactly matches the proposed one. 
*/ + for (i = 0; i < d->sindex; ++i) + { + if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem + || context != d->states[i].context) + continue; + for (j = 0; j < s->nelem; ++j) + if (s->elems[j].constraint + != d->states[i].elems.elems[j].constraint + || s->elems[j].index != d->states[i].elems.elems[j].index) + break; + if (j == s->nelem) + return i; + } + +#ifdef DEBUG + fprintf (stderr, "new state %zd\n nextpos:", i); + for (j = 0; j < s->nelem; ++j) + { + fprintf (stderr, " %zu:", s->elems[j].index); + prtok (d->tokens[s->elems[j].index]); + } + fprintf (stderr, "\n context:"); + if (context ^ CTX_ANY) + { + if (context & CTX_NONE) + fprintf (stderr, " CTX_NONE"); + if (context & CTX_LETTER) + fprintf (stderr, " CTX_LETTER"); + if (context & CTX_NEWLINE) + fprintf (stderr, " CTX_NEWLINE"); + } + else + fprintf (stderr, " CTX_ANY"); + fprintf (stderr, "\n"); +#endif + + /* We'll have to create a new state. */ + d->states = maybe_realloc (d->states, d->sindex, &d->salloc, + sizeof *d->states); + d->states[i].hash = hash; + alloc_position_set (&d->states[i].elems, s->nelem); + copy (s, &d->states[i].elems); + d->states[i].context = context; + d->states[i].constraint = 0; + d->states[i].first_end = 0; + d->states[i].mbps.nelem = 0; + d->states[i].mbps.elems = NULL; + + for (j = 0; j < s->nelem; ++j) + if (d->tokens[s->elems[j].index] < 0) + { + constraint = s->elems[j].constraint; + if (SUCCEEDS_IN_CONTEXT (constraint, context, CTX_ANY)) + d->states[i].constraint |= constraint; + if (!d->states[i].first_end) + d->states[i].first_end = d->tokens[s->elems[j].index]; + } + else if (d->tokens[s->elems[j].index] == BACKREF) + d->states[i].constraint = NO_CONSTRAINT; + + ++d->sindex; + + return i; +} + +/* Find the epsilon closure of a set of positions. If any position of the set + contains a symbol that matches the empty string in some context, replace + that position with the elements of its follow labeled with an appropriate + constraint. 
Repeat exhaustively until no funny positions are left.
+   S->elems must be large enough to hold the result.
+   VISITED is scratch space of at least D->tindex bytes; it is cleared
+   here the first time a position has to be expanded, and then records
+   which token indices have already been expanded so that none is
+   expanded twice.  */
+static void
+epsclosure (position_set *s, struct dfa const *d, char *visited)
+{
+  size_t i, j;
+  position p, old;
+  bool initialized = false;
+
+  for (i = 0; i < s->nelem; ++i)
+    if (d->tokens[s->elems[i].index] >= NOTCHAR
+        && d->tokens[s->elems[i].index] != BACKREF
+        && d->tokens[s->elems[i].index] != ANYCHAR
+        && d->tokens[s->elems[i].index] != MBCSET
+        && d->tokens[s->elems[i].index] < CSET)
+      {
+        if (!initialized)
+          {
+            memset (visited, 0, d->tindex * sizeof (*visited));
+            initialized = true;
+          }
+        old = s->elems[i];
+        p.constraint = old.constraint;
+        delete (s->elems[i], s);
+        if (visited[old.index])
+          {
+            --i; /* delete moved the next element into slot I; redo it.  */
+            continue;
+          }
+        visited[old.index] = 1;
+        switch (d->tokens[old.index]) /* Narrow the inherited constraint.  */
+          {
+          case BEGLINE:
+            p.constraint &= BEGLINE_CONSTRAINT;
+            break;
+          case ENDLINE:
+            p.constraint &= ENDLINE_CONSTRAINT;
+            break;
+          case BEGWORD:
+            p.constraint &= BEGWORD_CONSTRAINT;
+            break;
+          case ENDWORD:
+            p.constraint &= ENDWORD_CONSTRAINT;
+            break;
+          case LIMWORD:
+            p.constraint &= LIMWORD_CONSTRAINT;
+            break;
+          case NOTLIMWORD:
+            p.constraint &= NOTLIMWORD_CONSTRAINT;
+            break;
+          default:
+            break;
+          }
+        for (j = 0; j < d->follows[old.index].nelem; ++j)
+          {
+            p.index = d->follows[old.index].elems[j].index;
+            insert (p, s);
+          }
+        /* Force rescan to start at the beginning.  I is unsigned, so
+           this wraps and the loop's ++i brings it back to 0.  */
+        i = -1;
+      }
+}
+
+/* Returns the set of contexts for which there is at least one
+   character included in C.  The result is a bitwise OR of CTX_NEWLINE,
+   CTX_LETTER and CTX_NONE.  */
+
+static int
+charclass_context (charclass c)
+{
+  int context = 0;
+  unsigned int j;
+
+  if (tstbit (eolbyte, c))
+    context |= CTX_NEWLINE;
+
+  for (j = 0; j < CHARCLASS_WORDS; ++j)
+    {
+      if (c[j] & letters[j])
+        context |= CTX_LETTER;
+      if (c[j] & ~(letters[j] | newline[j])) /* Neither letter nor newline.  */
+        context |= CTX_NONE;
+    }
+
+  return context;
+}
+
+/* Returns the contexts on which the position set S depends.
Each context
+   in the set of returned contexts (let's call it SC) may have a different
+   follow set than other contexts in SC, and also different from the
+   follow set of the complement set (sc ^ CTX_ANY).  However, all contexts
+   in the complement set will have the same follow set.  */
+
+static int _GL_ATTRIBUTE_PURE
+state_separate_contexts (position_set const *s)
+{
+  int separate_contexts = 0;
+  size_t j;
+
+  for (j = 0; j < s->nelem; ++j)
+    {
+      if (PREV_NEWLINE_DEPENDENT (s->elems[j].constraint))
+        separate_contexts |= CTX_NEWLINE;
+      if (PREV_LETTER_DEPENDENT (s->elems[j].constraint))
+        separate_contexts |= CTX_LETTER;
+    }
+
+  return separate_contexts;
+}
+
+
+/* Perform bottom-up analysis on the parse tree, computing various functions.
+   Note that at this point, we're pretending constructs like \< are real
+   characters rather than constraints on what can follow them.
+
+   Nullable:  A node is nullable if it is at the root of a regexp that can
+   match the empty string.
+   *  EMPTY leaves are nullable.
+   * No other leaf is nullable.
+   * A QMARK or STAR node is nullable.
+   * A PLUS node is nullable if its argument is nullable.
+   * A CAT node is nullable if both its arguments are nullable.
+   * An OR node is nullable if either argument is nullable.
+
+   Firstpos:  The firstpos of a node is the set of positions (nonempty leaves)
+   that could correspond to the first character of a string matching the
+   regexp rooted at the given node.
+   * EMPTY leaves have empty firstpos.
+   * The firstpos of a nonempty leaf is that leaf itself.
+   * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
+     argument.
+   * The firstpos of a CAT node is the firstpos of the left argument, union
+     the firstpos of the right if the left argument is nullable.
+   * The firstpos of an OR node is the union of firstpos of each argument.
+
+   Lastpos:  The lastpos of a node is the set of positions that could
+   correspond to the last character of a string matching the regexp at
+   the given node.
+   * EMPTY leaves have empty lastpos.
+   * The lastpos of a nonempty leaf is that leaf itself.
+   * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
+     argument.
+   * The lastpos of a CAT node is the lastpos of its right argument, union
+     the lastpos of the left if the right argument is nullable.
+   * The lastpos of an OR node is the union of the lastpos of each argument.
+
+   Follow:  The follow of a position is the set of positions that could
+   correspond to the character following a character matching the node in
+   a string matching the regexp.  At this point we consider special symbols
+   that match the empty string in some context to be just normal characters.
+   Later, if we find that a special symbol is in a follow set, we will
+   replace it with the elements of its follow, labeled with an appropriate
+   constraint.
+   * Every node in the firstpos of the argument of a STAR or PLUS node is in
+     the follow of every node in the lastpos.
+   * Every node in the firstpos of the second argument of a CAT node is in
+     the follow of every node in the lastpos of the first argument.
+
+   Because of the postfix representation of the parse tree, the depth-first
+   analysis is conveniently done by a linear scan with the aid of a stack.
+   Sets are stored as arrays of the elements, obeying a stack-like allocation
+   scheme; the number of elements in each set deeper in the stack can be
+   used to determine the address of a particular set's array.
+
+   D is the dfa whose token array is analyzed; its follows array is
+   allocated and filled in here, and its initial states are created with
+   state_index.  SEARCHFLAG is stored in D->searchflag as a boolean.  */
+void
+dfaanalyze (struct dfa *d, int searchflag)
+{
+  /* Array allocated to hold position sets.  */
+  position *posalloc = xnmalloc (d->nleaves, 2 * sizeof *posalloc);
+  /* Firstpos and lastpos elements.  Each pointer starts at the high end
+     of its half of POSALLOC and moves down as leaves are pushed.  */
+  position *firstpos = posalloc + d->nleaves;
+  position *lastpos = firstpos + d->nleaves;
+
+  /* Stack for element counts and nullable flags.  */
+  struct
+  {
+    /* Whether the entry is nullable.  */
+    bool nullable;
+
+    /* Counts of firstpos and lastpos sets.  */
+    size_t nfirstpos;
+    size_t nlastpos;
+  } *stkalloc = xnmalloc (d->depth, sizeof *stkalloc), *stk = stkalloc;
+
+  position_set tmp;             /* Temporary set for merging sets.  */
+  position_set merged;          /* Result of merging sets.  */
+  int separate_contexts;        /* Context wanted by some position.  */
+  size_t i, j;
+  position *pos;
+  char *visited = xnmalloc (d->tindex, sizeof *visited);
+
+#ifdef DEBUG
+  fprintf (stderr, "dfaanalyze:\n");
+  for (i = 0; i < d->tindex; ++i)
+    {
+      fprintf (stderr, " %zu:", i);
+      prtok (d->tokens[i]);
+    }
+  putc ('\n', stderr);
+#endif
+
+  d->searchflag = searchflag != 0;
+  alloc_position_set (&merged, d->nleaves);
+  d->follows = xcalloc (d->tindex, sizeof *d->follows);
+
+  for (i = 0; i < d->tindex; ++i)
+    {
+      switch (d->tokens[i])
+        {
+        case EMPTY:
+          /* The empty set is nullable.  */
+          stk->nullable = true;
+
+          /* The firstpos and lastpos of the empty leaf are both empty.  */
+          stk->nfirstpos = stk->nlastpos = 0;
+          stk++;
+          break;
+
+        case STAR:
+        case PLUS:
+          /* Every element in the firstpos of the argument is in the follow
+             of every element in the lastpos.  */
+          tmp.nelem = stk[-1].nfirstpos;
+          tmp.elems = firstpos;
+          pos = lastpos;
+          for (j = 0; j < stk[-1].nlastpos; ++j)
+            {
+              merge (&tmp, &d->follows[pos[j].index], &merged);
+              copy (&merged, &d->follows[pos[j].index]);
+            }
+          /* fallthrough */
+
+        case QMARK:
+          /* A QMARK or STAR node is automatically nullable.  */
+          if (d->tokens[i] != PLUS)
+            stk[-1].nullable = true;
+          break;
+
+        case CAT:
+          /* Every element in the firstpos of the second argument is in the
+             follow of every element in the lastpos of the first argument.  */
+          tmp.nelem = stk[-1].nfirstpos;
+          tmp.elems = firstpos;
+          pos = lastpos + stk[-1].nlastpos;
+          for (j = 0; j < stk[-2].nlastpos; ++j)
+            {
+              merge (&tmp, &d->follows[pos[j].index], &merged);
+              copy (&merged, &d->follows[pos[j].index]);
+            }
+
+          /* The firstpos of a CAT node is the firstpos of the first argument,
+             union that of the second argument if the first is nullable.  */
+          if (stk[-2].nullable)
+            stk[-2].nfirstpos += stk[-1].nfirstpos;
+          else
+            firstpos += stk[-1].nfirstpos;
+
+          /* The lastpos of a CAT node is the lastpos of the second argument,
+             union that of the first argument if the second is nullable.  */
+          if (stk[-1].nullable)
+            stk[-2].nlastpos += stk[-1].nlastpos;
+          else
+            {
+              pos = lastpos + stk[-2].nlastpos;
+              for (j = stk[-1].nlastpos; j-- > 0;)
+                pos[j] = lastpos[j];
+              lastpos += stk[-2].nlastpos;
+              stk[-2].nlastpos = stk[-1].nlastpos;
+            }
+
+          /* A CAT node is nullable if both arguments are nullable.  */
+          stk[-2].nullable &= stk[-1].nullable;
+          stk--;
+          break;
+
+        case OR:
+          /* The firstpos is the union of the firstpos of each argument.  */
+          stk[-2].nfirstpos += stk[-1].nfirstpos;
+
+          /* The lastpos is the union of the lastpos of each argument.  */
+          stk[-2].nlastpos += stk[-1].nlastpos;
+
+          /* An OR node is nullable if either argument is nullable.  */
+          stk[-2].nullable |= stk[-1].nullable;
+          stk--;
+          break;
+
+        default:
+          /* Anything else is a nonempty position.  (Note that special
+             constructs like \< are treated as nonempty strings here;
+             an "epsilon closure" effectively makes them nullable later.
+             Backreferences have to get a real position so we can detect
+             transitions on them later.  But they are nullable.)  */
+          stk->nullable = d->tokens[i] == BACKREF;
+
+          /* This position is in its own firstpos and lastpos.  */
+          stk->nfirstpos = stk->nlastpos = 1;
+          stk++;
+
+          --firstpos, --lastpos;
+          firstpos->index = lastpos->index = i;
+          firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
+
+          /* Allocate the follow set for this position.  */
+          alloc_position_set (&d->follows[i], 1);
+          break;
+        }
+#ifdef DEBUG
+      /* ... balance the above nonsyntactic #ifdef goo...  */
+      fprintf (stderr, "node %zu:", i);
+      prtok (d->tokens[i]);
+      putc ('\n', stderr);
+      fprintf (stderr,
+               stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
+      fprintf (stderr, " firstpos:");
+      for (j = stk[-1].nfirstpos; j-- > 0;)
+        {
+          fprintf (stderr, " %zu:", firstpos[j].index);
+          prtok (d->tokens[firstpos[j].index]);
+        }
+      fprintf (stderr, "\n lastpos:");
+      for (j = stk[-1].nlastpos; j-- > 0;)
+        {
+          fprintf (stderr, " %zu:", lastpos[j].index);
+          prtok (d->tokens[lastpos[j].index]);
+        }
+      putc ('\n', stderr);
+#endif
+    }
+
+  /* For each follow set that is the follow set of a real position, replace
+     it with its epsilon closure.  */
+  for (i = 0; i < d->tindex; ++i)
+    if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
+        || d->tokens[i] == ANYCHAR || d->tokens[i] == MBCSET
+        || d->tokens[i] >= CSET)
+      {
+#ifdef DEBUG
+        fprintf (stderr, "follows(%zu:", i);
+        prtok (d->tokens[i]);
+        fprintf (stderr, "):");
+        for (j = d->follows[i].nelem; j-- > 0;)
+          {
+            fprintf (stderr, " %zu:", d->follows[i].elems[j].index);
+            prtok (d->tokens[d->follows[i].elems[j].index]);
+          }
+        putc ('\n', stderr);
+#endif
+        copy (&d->follows[i], &merged);
+        epsclosure (&merged, d, visited);
+        copy (&merged, &d->follows[i]);
+      }
+
+  /* Get the epsilon closure of the firstpos of the regexp.  The result will
+     be the set of positions of state 0.  */
+  merged.nelem = 0;
+  for (i = 0; i < stk[-1].nfirstpos; ++i)
+    insert (firstpos[i], &merged);
+  epsclosure (&merged, d, visited);
+
+  /* Build the initial state.  Up to three states are created from the
+     same position set: one for CTX_NEWLINE when some position depends on
+     it, one for the remaining contexts, and one for CTX_LETTER when some
+     position depends on that.  */
+  separate_contexts = state_separate_contexts (&merged);
+  if (separate_contexts & CTX_NEWLINE)
+    state_index (d, &merged, CTX_NEWLINE);
+  d->initstate_others = d->min_trcount
+    = state_index (d, &merged, separate_contexts ^ CTX_ANY);
+  if (separate_contexts & CTX_LETTER)
+    d->initstate_letter = d->min_trcount
+      = state_index (d, &merged, CTX_LETTER);
+  else
+    d->initstate_letter = d->initstate_others;
+  d->min_trcount++;
+
+  free (posalloc);
+  free (stkalloc);
+  free (merged.elems);
+  free (visited);
+}
+
+
+/* Find, for each character, the transition out of state s of d, and store
+   it in the appropriate slot of trans.
+
+   We divide the positions of s into groups (positions can appear in more
+   than one group).  Each group is labeled with a set of characters that
+   every position in the group matches (taking into account, if necessary,
+   preceding context information of s).  For each group, find the union
+   of its elements' follows.  This set is the set of positions of the
+   new state.  For each character in the group's label, set the transition
+   on this character to be to a state corresponding to the set's positions,
+   and its associated backward context information, if necessary.
+
+   If we are building a searching matcher, we include the positions of state
+   0 in every state.
+
+   The collection of groups is constructed by building an equivalence-class
+   partition of the positions of s.
+
+   For each position, find the set of characters C that it matches.  Eliminate
+   any characters from C that fail on grounds of backward context.
+
+   Search through the groups, looking for a group whose label L has nonempty
+   intersection with C.  If L - C is nonempty, create a new group labeled
+   L - C and having the same positions as the current group, and set L to
+   the intersection of L and C.  Insert the position in this group, set
+   C = C - L, and resume scanning.
+ + If after comparing with every group there are characters remaining in C, + create a new group labeled with the characters of C and insert this + position in that group. */ +void +dfastate (state_num s, struct dfa *d, state_num trans[]) +{ + leaf_set grps[NOTCHAR]; /* As many as will ever be needed. */ + charclass labels[NOTCHAR]; /* Labels corresponding to the groups. */ + size_t ngrps = 0; /* Number of groups actually used. */ + position pos; /* Current position being considered. */ + charclass matches; /* Set of matching characters. */ + charclass_word matchesf; /* Nonzero if matches is nonempty. */ + charclass intersect; /* Intersection with some label set. */ + charclass_word intersectf; /* Nonzero if intersect is nonempty. */ + charclass leftovers; /* Stuff in the label that didn't match. */ + charclass_word leftoversf; /* Nonzero if leftovers is nonempty. */ + position_set follows; /* Union of the follows of some group. */ + position_set tmp; /* Temporary space for merging sets. */ + int possible_contexts; /* Contexts that this group can match. */ + int separate_contexts; /* Context that new state wants to know. */ + state_num state; /* New state. */ + state_num state_newline; /* New state on a newline transition. */ + state_num state_letter; /* New state on a letter transition. */ + bool next_isnt_1st_byte = false; /* We can't add state0. 
*/ + size_t i, j, k; + +#ifdef DEBUG + fprintf (stderr, "build state %td\n", s); +#endif + + zeroset (matches); + + for (i = 0; i < d->states[s].elems.nelem; ++i) + { + pos = d->states[s].elems.elems[i]; + if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR) + setbit (d->tokens[pos.index], matches); + else if (d->tokens[pos.index] >= CSET) + copyset (d->charclasses[d->tokens[pos.index] - CSET], matches); + else + { + if (d->tokens[pos.index] == MBCSET + || d->tokens[pos.index] == ANYCHAR) + { + /* ANYCHAR and MBCSET must match with a single character, so we + must put it to d->states[s].mbps, which contains the positions + which can match with a single character not a byte. */ + if (d->states[s].mbps.nelem == 0) + alloc_position_set (&d->states[s].mbps, 1); + insert (pos, &(d->states[s].mbps)); + } + continue; + } + + /* Some characters may need to be eliminated from matches because + they fail in the current context. */ + if (pos.constraint != NO_CONSTRAINT) + { + if (!SUCCEEDS_IN_CONTEXT (pos.constraint, + d->states[s].context, CTX_NEWLINE)) + for (j = 0; j < CHARCLASS_WORDS; ++j) + matches[j] &= ~newline[j]; + if (!SUCCEEDS_IN_CONTEXT (pos.constraint, + d->states[s].context, CTX_LETTER)) + for (j = 0; j < CHARCLASS_WORDS; ++j) + matches[j] &= ~letters[j]; + if (!SUCCEEDS_IN_CONTEXT (pos.constraint, + d->states[s].context, CTX_NONE)) + for (j = 0; j < CHARCLASS_WORDS; ++j) + matches[j] &= letters[j] | newline[j]; + + /* If there are no characters left, there's no point in going on. 
*/ + for (j = 0; j < CHARCLASS_WORDS && !matches[j]; ++j) + continue; + if (j == CHARCLASS_WORDS) + continue; + } + +#ifdef DEBUG + fprintf (stderr, " nextpos %zu:", pos.index); + prtok (d->tokens[pos.index]); + fprintf (stderr, " of"); + for (j = 0; j < NOTCHAR; j++) + if (tstbit (j, matches)) + fprintf (stderr, " 0x%02zx", j); + fprintf (stderr, "\n"); +#endif + + for (j = 0; j < ngrps; ++j) + { + /* If matches contains a single character only, and the current + group's label doesn't contain that character, go on to the + next group. */ + if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR + && !tstbit (d->tokens[pos.index], labels[j])) + continue; + + /* Check if this group's label has a nonempty intersection with + matches. */ + intersectf = 0; + for (k = 0; k < CHARCLASS_WORDS; ++k) + intersectf |= intersect[k] = matches[k] & labels[j][k]; + if (!intersectf) + continue; + + /* It does; now find the set differences both ways. */ + leftoversf = matchesf = 0; + for (k = 0; k < CHARCLASS_WORDS; ++k) + { + /* Even an optimizing compiler can't know this for sure. */ + charclass_word match = matches[k], label = labels[j][k]; + + leftoversf |= leftovers[k] = ~match & label; + matchesf |= matches[k] = match & ~label; + } + + /* If there were leftovers, create a new group labeled with them. */ + if (leftoversf) + { + copyset (leftovers, labels[ngrps]); + copyset (intersect, labels[j]); + grps[ngrps].elems = xnmalloc (d->nleaves, + sizeof *grps[ngrps].elems); + memcpy (grps[ngrps].elems, grps[j].elems, + sizeof (grps[j].elems[0]) * grps[j].nelem); + grps[ngrps].nelem = grps[j].nelem; + ++ngrps; + } + + /* Put the position in the current group. The constraint is + irrelevant here. */ + grps[j].elems[grps[j].nelem++] = pos.index; + + /* If every character matching the current position has been + accounted for, we're done. 
*/ + if (!matchesf) + break; + } + + /* If we've passed the last group, and there are still characters + unaccounted for, then we'll have to create a new group. */ + if (j == ngrps) + { + copyset (matches, labels[ngrps]); + zeroset (matches); + grps[ngrps].elems = xnmalloc (d->nleaves, sizeof *grps[ngrps].elems); + grps[ngrps].nelem = 1; + grps[ngrps].elems[0] = pos.index; + ++ngrps; + } + } + + alloc_position_set (&follows, d->nleaves); + alloc_position_set (&tmp, d->nleaves); + + /* If we are a searching matcher, the default transition is to a state + containing the positions of state 0, otherwise the default transition + is to fail miserably. */ + if (d->searchflag) + { + /* Find the state(s) corresponding to the positions of state 0. */ + copy (&d->states[0].elems, &follows); + separate_contexts = state_separate_contexts (&follows); + state = state_index (d, &follows, separate_contexts ^ CTX_ANY); + if (separate_contexts & CTX_NEWLINE) + state_newline = state_index (d, &follows, CTX_NEWLINE); + else + state_newline = state; + if (separate_contexts & CTX_LETTER) + state_letter = state_index (d, &follows, CTX_LETTER); + else + state_letter = state; + + for (i = 0; i < NOTCHAR; ++i) + trans[i] = unibyte_word_constituent (i) ? state_letter : state; + trans[eolbyte] = state_newline; + } + else + for (i = 0; i < NOTCHAR; ++i) + trans[i] = -1; + + for (i = 0; i < ngrps; ++i) + { + follows.nelem = 0; + + /* Find the union of the follows of the positions of the group. + This is a hideously inefficient loop. Fix it someday. */ + for (j = 0; j < grps[i].nelem; ++j) + for (k = 0; k < d->follows[grps[i].elems[j]].nelem; ++k) + insert (d->follows[grps[i].elems[j]].elems[k], &follows); + + if (d->multibyte) + { + /* If a token in follows.elems is not 1st byte of a multibyte + character, or the states of follows must accept the bytes + which are not 1st byte of the multibyte character. 
+ Then, if a state of follows encounter a byte, it must not be + a 1st byte of a multibyte character nor single byte character. + We cansel to add state[0].follows to next state, because + state[0] must accept 1st-byte + + For example, we assume is a certain single byte + character, is a certain multibyte character, and the + codepoint of equals the 2nd byte of the codepoint of + . + When state[0] accepts , state[i] transit to state[i+1] + by accepting accepts 1st byte of , and state[i+1] + accepts 2nd byte of , if state[i+1] encounter the + codepoint of , it must not be but 2nd byte of + , so we cannot add state[0]. */ + + next_isnt_1st_byte = false; + for (j = 0; j < follows.nelem; ++j) + { + if (!(d->multibyte_prop[follows.elems[j].index] & 1)) + { + next_isnt_1st_byte = true; + break; + } + } + } + + /* If we are building a searching matcher, throw in the positions + of state 0 as well. */ + if (d->searchflag && (!d->multibyte || !next_isnt_1st_byte)) + { + merge (&d->states[0].elems, &follows, &tmp); + copy (&tmp, &follows); + } + + /* Find out if the new state will want any context information. */ + possible_contexts = charclass_context (labels[i]); + separate_contexts = state_separate_contexts (&follows); + + /* Find the state(s) corresponding to the union of the follows. 
*/ + if ((separate_contexts & possible_contexts) != possible_contexts) + state = state_index (d, &follows, separate_contexts ^ CTX_ANY); + else + state = -1; + if (separate_contexts & possible_contexts & CTX_NEWLINE) + state_newline = state_index (d, &follows, CTX_NEWLINE); + else + state_newline = state; + if (separate_contexts & possible_contexts & CTX_LETTER) + state_letter = state_index (d, &follows, CTX_LETTER); + else + state_letter = state; + +#ifdef DEBUG + fprintf (stderr, "group %zu\n nextpos:", i); + for (j = 0; j < grps[i].nelem; ++j) + { + fprintf (stderr, " %zu:", grps[i].elems[j]); + prtok (d->tokens[grps[i].elems[j]]); + } + fprintf (stderr, "\n follows:"); + for (j = 0; j < follows.nelem; ++j) + { + fprintf (stderr, " %zu:", follows.elems[j].index); + prtok (d->tokens[follows.elems[j].index]); + } + fprintf (stderr, "\n states:"); + if (possible_contexts & CTX_NEWLINE) + fprintf (stderr, " CTX_NEWLINE:%td", state_newline); + if (possible_contexts & CTX_LETTER) + fprintf (stderr, " CTX_LETTER:%td", state_letter); + if (possible_contexts & CTX_NONE) + fprintf (stderr, " CTX_NONE:%td", state); + fprintf (stderr, "\n"); +#endif + + /* Set the transitions for each character in the current label. */ + for (j = 0; j < CHARCLASS_WORDS; ++j) + for (k = 0; k < CHARCLASS_WORD_BITS; ++k) + if (labels[i][j] >> k & 1) + { + int c = j * CHARCLASS_WORD_BITS + k; + + if (c == eolbyte) + trans[c] = state_newline; + else if (unibyte_word_constituent (c)) + trans[c] = state_letter; + else if (c < NOTCHAR) + trans[c] = state; + } + } + +#ifdef DEBUG + fprintf (stderr, "trans table %td", s); + for (i = 0; i < NOTCHAR; ++i) + { + if (!(i & 0xf)) + fprintf (stderr, "\n"); + fprintf (stderr, " %2td", trans[i]); + } + fprintf (stderr, "\n"); +#endif + + for (i = 0; i < ngrps; ++i) + free (grps[i].elems); + free (follows.elems); + free (tmp.elems); +} + +/* Make sure D's state arrays are large enough to hold NEW_STATE. 
*/ +static void +realloc_trans_if_necessary (struct dfa *d, state_num new_state) +{ + state_num oldalloc = d->tralloc; + if (oldalloc <= new_state) + { + state_num **realtrans = d->trans ? d->trans - 1 : NULL; + size_t newalloc, newalloc1; + newalloc1 = new_state + 1; + realtrans = x2nrealloc (realtrans, &newalloc1, sizeof *realtrans); + realtrans[0] = NULL; + d->trans = realtrans + 1; + d->tralloc = newalloc = newalloc1 - 1; + d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails); + d->success = xnrealloc (d->success, newalloc, sizeof *d->success); + d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines); + for (; oldalloc < newalloc; oldalloc++) + { + d->trans[oldalloc] = NULL; + d->fails[oldalloc] = NULL; + } + } +} + +/* Some routines for manipulating a compiled dfa's transition tables. + Each state may or may not have a transition table; if it does, and it + is a non-accepting state, then d->trans[state] points to its table. + If it is an accepting state then d->fails[state] points to its table. + If it has no table at all, then d->trans[state] is NULL. + TODO: Improve this comment, get rid of the unnecessary redundancy. */ + +static void +build_state (state_num s, struct dfa *d) +{ + state_num *trans; /* The new transition table. */ + state_num i, maxstate; + + /* Set an upper limit on the number of transition tables that will ever + exist at once. 1024 is arbitrary. The idea is that the frequently + used transition tables will be quickly rebuilt, whereas the ones that + were only needed once or twice will be cleared away. However, do not + clear the initial D->min_trcount states, since they are always used. */ + if (d->trcount >= 1024) + { + for (i = d->min_trcount; i < d->tralloc; ++i) + { + free (d->trans[i]); + free (d->fails[i]); + d->trans[i] = d->fails[i] = NULL; + } + d->trcount = d->min_trcount; + } + + ++d->trcount; + + /* Set up the success bits for this state. 
*/ + d->success[s] = 0; + if (ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_NEWLINE, s, *d)) + d->success[s] |= CTX_NEWLINE; + if (ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_LETTER, s, *d)) + d->success[s] |= CTX_LETTER; + if (ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_NONE, s, *d)) + d->success[s] |= CTX_NONE; + + trans = xmalloc (NOTCHAR * sizeof *trans); + dfastate (s, d, trans); + + /* Now go through the new transition table, and make sure that the trans + and fail arrays are allocated large enough to hold a pointer for the + largest state mentioned in the table. */ + maxstate = -1; + for (i = 0; i < NOTCHAR; ++i) + if (maxstate < trans[i]) + maxstate = trans[i]; + realloc_trans_if_necessary (d, maxstate); + + /* Keep the newline transition in a special place so we can use it as + a sentinel. */ + d->newlines[s] = trans[eolbyte]; + trans[eolbyte] = -1; + + if (ACCEPTING (s, *d)) + d->fails[s] = trans; + else + d->trans[s] = trans; +} + +/* Multibyte character handling sub-routines for dfaexec. */ + +/* Return values of transit_state_singlebyte, and + transit_state_consume_1char. */ +typedef enum +{ + TRANSIT_STATE_IN_PROGRESS, /* State transition has not finished. */ + TRANSIT_STATE_DONE, /* State transition has finished. */ + TRANSIT_STATE_END_BUFFER /* Reach the end of the buffer. */ +} status_transit_state; + +/* Consume a single byte and transit state from 's' to '*next_state'. + This function is almost same as the state transition routin in dfaexec. + But state transition is done just once, otherwise matching succeed or + reach the end of the buffer. 
*/ +static status_transit_state +transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const *p, + state_num * next_state) +{ + state_num *t; + state_num works = s; + + status_transit_state rval = TRANSIT_STATE_IN_PROGRESS; + + while (rval == TRANSIT_STATE_IN_PROGRESS) + { + if ((t = d->trans[works]) != NULL) + { + works = t[*p]; + rval = TRANSIT_STATE_DONE; + if (works < 0) + works = 0; + } + else if (works < 0) + works = 0; + else if (d->fails[works]) + { + works = d->fails[works][*p]; + rval = TRANSIT_STATE_DONE; + } + else + { + build_state (works, d); + } + } + *next_state = works; + return rval; +} + +/* Match a "." against the current context. Return the length of the + match, in bytes. POS is the position of the ".". */ +static int +match_anychar (struct dfa *d, state_num s, position pos, + wint_t wc, size_t mbclen) +{ + int context; + + /* Check syntax bits. */ + if (wc == (wchar_t) '\n') + { + if (!(syntax_bits & RE_DOT_NEWLINE)) + return 0; + } + else if (wc == (wchar_t) '\0') + { + if (syntax_bits & RE_DOT_NOT_NULL) + return 0; + } + else if (wc == WEOF) + return 0; + + context = wchar_context (wc); + if (!SUCCEEDS_IN_CONTEXT (pos.constraint, d->states[s].context, context)) + return 0; + + return mbclen; +} + +/* Check whether each of 'd->states[s].mbps.elem' can match. Then return the + array which corresponds to 'd->states[s].mbps.elem'; each element of the + array contains the number of bytes with which the element can match. + + The caller MUST free the array which this function return. */ +static int * +check_matching_with_multibyte_ops (struct dfa *d, state_num s, + char const *p, wint_t wc, size_t mbclen) +{ + size_t i; + int *rarray; + + rarray = d->mb_match_lens; + for (i = 0; i < d->states[s].mbps.nelem; ++i) + { + position pos = d->states[s].mbps.elems[i]; + switch (d->tokens[pos.index]) + { + case ANYCHAR: + rarray[i] = match_anychar (d, s, pos, wc, mbclen); + break; + default: + break; /* cannot happen. 
*/ + } + } + return rarray; +} + +/* Consume a single character and enumerate all of the positions which can + be the next position from the state 's'. + + 'match_lens' is the input. It can be NULL, but it can also be the output + of check_matching_with_multibyte_ops for optimization. + + 'mbclen' and 'pps' are the output. 'mbclen' is the length of the + character consumed, and 'pps' is the set this function enumerates. */ +static status_transit_state +transit_state_consume_1char (struct dfa *d, state_num s, + unsigned char const **pp, + wint_t wc, size_t mbclen, + int *match_lens) +{ + size_t i, j; + int k; + state_num s1, s2; + status_transit_state rs = TRANSIT_STATE_DONE; + + if (! match_lens && d->states[s].mbps.nelem != 0) + match_lens = check_matching_with_multibyte_ops (d, s, (char const *) *pp, + wc, mbclen); + + /* Calculate the state which can be reached from the state 's' by + consuming 'mbclen' single bytes from the buffer. */ + s1 = s; + for (k = 0; k < mbclen; k++) + { + s2 = s1; + rs = transit_state_singlebyte (d, s2, (*pp)++, &s1); + } + copy (&d->states[s1].elems, &d->mb_follows); + + /* Add all of the positions which can be reached from 's' by consuming + a single character. */ + for (i = 0; i < d->states[s].mbps.nelem; i++) + { + if (match_lens[i] == mbclen) + for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem; + j++) + insert (d->follows[d->states[s].mbps.elems[i].index].elems[j], + &d->mb_follows); + } + + /* FIXME: this return value is always ignored. */ + return rs; +} + +/* Transit state from s, then return new state and update the pointer of the + buffer. This function is for some operator which can match with a multi- + byte character or a collating element (which may be multi characters). */ +static state_num +transit_state (struct dfa *d, state_num s, unsigned char const **pp, + unsigned char const *end) +{ + state_num s1; + int mbclen; /* The length of current input multibyte character. 
*/ + int maxlen = 0; + size_t i, j; + int *match_lens = NULL; + size_t nelem = d->states[s].mbps.nelem; /* Just a alias. */ + unsigned char const *p1 = *pp; + wint_t wc; + + if (nelem > 0) + /* This state has (a) multibyte operator(s). + We check whether each of them can match or not. */ + { + /* Note: caller must free the return value of this function. */ + mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d); + match_lens = check_matching_with_multibyte_ops (d, s, (char const *) *pp, + wc, mbclen); + + for (i = 0; i < nelem; i++) + /* Search the operator which match the longest string, + in this state. */ + { + if (match_lens[i] > maxlen) + maxlen = match_lens[i]; + } + } + + if (nelem == 0 || maxlen == 0) + /* This state has no multibyte operator which can match. + We need to check only one single byte character. */ + { + status_transit_state rs; + rs = transit_state_singlebyte (d, s, *pp, &s1); + + /* We must update the pointer if state transition succeeded. */ + if (rs == TRANSIT_STATE_DONE) + ++*pp; + + return s1; + } + + /* This state has some operators which can match a multibyte character. */ + d->mb_follows.nelem = 0; + + /* 'maxlen' may be longer than the length of a character, because it may + not be a character but a (multi character) collating element. + We enumerate all of the positions which 's' can reach by consuming + 'maxlen' bytes. 
*/ + transit_state_consume_1char (d, s, pp, wc, mbclen, match_lens); + + s1 = state_index (d, &d->mb_follows, wchar_context (wc)); + realloc_trans_if_necessary (d, s1); + + while (*pp - p1 < maxlen) + { + mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d); + transit_state_consume_1char (d, s1, pp, wc, mbclen, NULL); + + for (i = 0; i < nelem; i++) + { + if (match_lens[i] == *pp - p1) + for (j = 0; + j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++) + insert (d->follows[d->states[s1].mbps.elems[i].index].elems[j], + &d->mb_follows); + } + + s1 = state_index (d, &d->mb_follows, wchar_context (wc)); + realloc_trans_if_necessary (d, s1); + } + return s1; +} + +/* The initial state may encounter a byte which is not a single byte character + nor the first byte of a multibyte character. But it is incorrect for the + initial state to accept such a byte. For example, in Shift JIS the regular + expression "\\" accepts the codepoint 0x5c, but should not accept the second + byte of the codepoint 0x815c. Then the initial state must skip the bytes + that are not a single byte character nor the first byte of a multibyte + character. + + Given DFA state d, use mbs_to_wchar to advance MBP until it reaches or + exceeds P. If WCP is non-NULL, set *WCP to the final wide character + processed, or if no wide character is processed, set it to WEOF. + Both P and MBP must be no larger than END. */ +static unsigned char const * +skip_remains_mb (struct dfa *d, unsigned char const *p, + unsigned char const *mbp, char const *end, wint_t *wcp) +{ + wint_t wc = WEOF; + while (mbp < p) + mbp += mbs_to_wchar (&wc, (char const *) mbp, + end - (char const *) mbp, d); + if (wcp != NULL) + *wcp = wc; + return mbp; +} + +/* Search through a buffer looking for a match to the given struct dfa. + Find the first occurrence of a string matching the regexp in the + buffer, and the shortest possible version thereof. 
Return a pointer to + the first character after the match, or NULL if none is found. BEGIN + points to the beginning of the buffer, and END points to the first byte + after its end. Note however that we store a sentinel byte (usually + newline) in *END, so the actual buffer must be one byte longer. + When ALLOW_NL is nonzero, newlines may appear in the matching string. + If COUNT is non-NULL, increment *COUNT once for each newline processed. + Finally, if BACKREF is non-NULL set *BACKREF to indicate whether we + encountered a DFA-unfriendly construct. The caller may use this to + decide whether to fall back on a matcher like regex. If MULTIBYTE, + the input consists of multibyte characters and/or encoding-error bytes. + Otherwise, the input consists of single-byte characters. + Here is the list of features that make this DFA matcher punt: + - [M-N]-range-in-MB-locale: regex is up to 25% faster on [a-z] + - back-reference: (.)\1 + - word-delimiter-in-MB-locale: \<, \>, \b + */ +static inline char * +dfaexec_main (struct dfa *d, char const *begin, char *end, int allow_nl, + size_t *count, bool multibyte) +{ + state_num s, s1; /* Current state. */ + unsigned char const *p, *mbp; /* Current input character. */ + state_num **trans, *t; /* Copy of d->trans so it can be optimized + into a register. */ + unsigned char eol = eolbyte; /* Likewise for eolbyte. */ + unsigned char saved_end; + size_t nlcount = 0; + + if (!d->tralloc) + { + realloc_trans_if_necessary (d, 1); + build_state (0, d); + } + + s = s1 = 0; + p = mbp = (unsigned char const *) begin; + trans = d->trans; + saved_end = *(unsigned char *) end; + *end = eol; + + if (multibyte) + { + memset (&d->mbs, 0, sizeof d->mbs); + if (! 
d->mb_match_lens) + { + d->mb_match_lens = xnmalloc (d->nleaves, sizeof *d->mb_match_lens); + alloc_position_set (&d->mb_follows, d->nleaves); + } + } + + for (;;) + { + if (multibyte) + { + while ((t = trans[s]) != NULL) + { + s1 = s; + + if (s < d->min_trcount) + { + if (d->min_trcount == 1) + { + if (d->states[s].mbps.nelem == 0) + { + do + { + while (t[*p] == 0) + p++; + p = mbp = skip_remains_mb (d, p, mbp, end, NULL); + } + while (t[*p] == 0); + } + else + p = mbp = skip_remains_mb (d, p, mbp, end, NULL); + } + else + { + wint_t wc; + mbp = skip_remains_mb (d, p, mbp, end, &wc); + + /* If d->min_trcount is greater than 1, maybe + transit to another initial state after skip. */ + if (p < mbp) + { + int context = wchar_context (wc); + if (context == CTX_LETTER) + s = d->initstate_letter; + else + /* It's CTX_NONE. CTX_NEWLINE cannot happen, + as we assume that a newline is always a + single byte character. */ + s = d->initstate_others; + p = mbp; + s1 = s; + } + } + } + + if (d->states[s].mbps.nelem == 0) + { + s = t[*p++]; + continue; + } + + /* The following code is used twice. + Use a macro to avoid the risk that they diverge. */ +#define State_transition() \ + do { \ + /* Can match with a multibyte character (and multi-character \ + collating element). Transition table might be updated. */ \ + s = transit_state (d, s, &p, (unsigned char *) end); \ + \ + /* If previous character is newline after a transition \ + for ANYCHAR or MBCSET in non-UTF8 multibyte locales, \ + check whether current position is beyond the end of \ + the input buffer. Also, transit to initial state if \ + !ALLOW_NL, even if RE_DOT_NEWLINE is set. 
*/ \ + if (p[-1] == eol) \ + { \ + if ((char *) p > end) \ + { \ + p = NULL; \ + goto done; \ + } \ + \ + nlcount++; \ + \ + if (!allow_nl) \ + s = 0; \ + } \ + \ + mbp = p; \ + trans = d->trans; \ + } while (0) + + State_transition(); + } + } + else + { + if (s == 0 && (t = trans[s]) != NULL) + { + while (t[*p] == 0) + p++; + s1 = 0; + s = t[*p++]; + } + + while ((t = trans[s]) != NULL) + { + s1 = t[*p++]; + if ((t = trans[s1]) == NULL) + { + state_num tmp = s; + s = s1; + s1 = tmp; /* swap */ + break; + } + s = t[*p++]; + } + } + + if (s < 0) + { + if ((char *) p > end || p[-1] != eol || d->newlines[s1] < 0) + { + p = NULL; + goto done; + } + + /* The previous character was a newline, count it, and skip + checking of multibyte character boundary until here. */ + nlcount++; + mbp = p; + + s = allow_nl ? d->newlines[s1] : 0; + } + + if (d->fails[s]) + { + if (d->success[s] & sbit[*p]) + goto done; + + s1 = s; + if (multibyte) + State_transition(); + else + s = d->fails[s][*p++]; + } + else + { + if (!d->trans[s]) + build_state (s, d); + trans = d->trans; + } + } + + done: + if (count) + *count += nlcount; + *end = saved_end; + return (char *) p; +} + +/* Specialized versions of dfaexec_main for multibyte and single-byte + cases. This is for performance. */ + +static char * +dfaexec_mb (struct dfa *d, char const *begin, char *end, + int allow_nl, size_t *count, int *backref) +{ + return dfaexec_main (d, begin, end, allow_nl, count, true); +} + +static char * +dfaexec_sb (struct dfa *d, char const *begin, char *end, + int allow_nl, size_t *count, int *backref) +{ + return dfaexec_main (d, begin, end, allow_nl, count, false); +} + +/* Always set *BACKREF and return BEGIN. Use this wrapper for + any regexp that uses a construct not supported by this code. 
*/ +static char * +dfaexec_noop (struct dfa *d, char const *begin, char *end, + int allow_nl, size_t *count, int *backref) +{ + *backref = 1; + return (char *) begin; +} + +/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, BACKREF, D->multibyte), + but faster. */ + +char * +dfaexec (struct dfa *d, char const *begin, char *end, + int allow_nl, size_t *count, int *backref) +{ + return d->dfaexec (d, begin, end, allow_nl, count, backref); +} + +struct dfa * +dfasuperset (struct dfa const *d) +{ + return d->superset; +} + +bool +dfaisfast (struct dfa const *d) +{ + return d->fast; +} + +static void +free_mbdata (struct dfa *d) +{ + size_t i; + + free (d->multibyte_prop); + + for (i = 0; i < d->nmbcsets; ++i) + { + struct mb_char_classes *p = &(d->mbcsets[i]); + free (p->chars); + } + + free (d->mbcsets); + free (d->mb_follows.elems); + free (d->mb_match_lens); + d->mb_match_lens = NULL; +} + +/* Initialize the components of a dfa that the other routines don't + initialize for themselves. */ +void +dfainit (struct dfa *d) +{ + memset (d, 0, sizeof *d); + d->multibyte = MB_CUR_MAX > 1; + d->dfaexec = d->multibyte ? dfaexec_mb : dfaexec_sb; + d->fast = !d->multibyte; +} + +/* Return true if every construct in D is supported by this DFA matcher. */ +static bool _GL_ATTRIBUTE_PURE +dfa_supported (struct dfa const *d) +{ + for (size_t i = 0; i < d->tindex; i++) + { + switch (d->tokens[i]) + { + case BEGWORD: + case ENDWORD: + case LIMWORD: + case NOTLIMWORD: + if (!d->multibyte) + continue; + /* fallthrough */ + + case BACKREF: + case MBCSET: + return false; + } + } + return true; +} + +static void +dfaoptimize (struct dfa *d) +{ + size_t i; + bool have_backref = false; + + if (!using_utf8 ()) + return; + + for (i = 0; i < d->tindex; ++i) + { + switch (d->tokens[i]) + { + case ANYCHAR: + /* Lowered. */ + abort (); + case BACKREF: + have_backref = true; + break; + case MBCSET: + /* Requires multi-byte algorithm. 
*/ + return; + default: + break; + } + } + + if (!have_backref && d->superset) + { + /* The superset DFA is not likely to be much faster, so remove it. */ + dfafree (d->superset); + free (d->superset); + d->superset = NULL; + } + + free_mbdata (d); + d->multibyte = false; + d->dfaexec = dfaexec_sb; +} + +static void +dfassbuild (struct dfa *d) +{ + size_t i, j; + charclass ccl; + bool have_achar = false; + bool have_nchar = false; + struct dfa *sup = dfaalloc (); + + *sup = *d; + sup->multibyte = false; + sup->dfaexec = dfaexec_sb; + sup->multibyte_prop = NULL; + sup->mbcsets = NULL; + sup->superset = NULL; + sup->states = NULL; + sup->sindex = 0; + sup->follows = NULL; + sup->tralloc = 0; + sup->trans = NULL; + sup->fails = NULL; + sup->success = NULL; + sup->newlines = NULL; + + sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses); + if (d->cindex) + { + memcpy (sup->charclasses, d->charclasses, + d->cindex * sizeof *sup->charclasses); + } + + sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens); + sup->talloc = d->tindex * 2; + + for (i = j = 0; i < d->tindex; i++) + { + switch (d->tokens[i]) + { + case ANYCHAR: + case MBCSET: + case BACKREF: + zeroset (ccl); + notset (ccl); + sup->tokens[j++] = CSET + dfa_charclass_index (sup, ccl); + sup->tokens[j++] = STAR; + if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR + || d->tokens[i + 1] == PLUS) + i++; + have_achar = true; + break; + case BEGWORD: + case ENDWORD: + case LIMWORD: + case NOTLIMWORD: + if (d->multibyte) + { + /* These constraints aren't supported in a multibyte locale. + Ignore them in the superset DFA. 
*/ + sup->tokens[j++] = EMPTY; + break; + } + default: + sup->tokens[j++] = d->tokens[i]; + if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR) + || d->tokens[i] >= CSET) + have_nchar = true; + break; + } + } + sup->tindex = j; + + if (have_nchar && (have_achar || d->multibyte)) + d->superset = sup; + else + { + dfafree (sup); + free (sup); + } +} + +/* Parse and analyze a single string of the given length. */ +void +dfacomp (char const *s, size_t len, struct dfa *d, int searchflag) +{ + dfainit (d); + dfaparse (s, len, d); + dfassbuild (d); + + if (dfa_supported (d)) + { + dfaoptimize (d); + dfaanalyze (d, searchflag); + } + else + { + d->dfaexec = dfaexec_noop; + } + + if (d->superset) + { + d->fast = true; + dfaanalyze (d->superset, searchflag); + } +} + +/* Free the storage held by the components of a dfa. */ +void +dfafree (struct dfa *d) +{ + size_t i; + + free (d->charclasses); + free (d->tokens); + + if (d->multibyte) + free_mbdata (d); + + for (i = 0; i < d->sindex; ++i) + { + free (d->states[i].elems.elems); + free (d->states[i].mbps.elems); + } + free (d->states); + + if (d->follows) + { + for (i = 0; i < d->tindex; ++i) + free (d->follows[i].elems); + free (d->follows); + } + + if (d->trans) + { + for (i = 0; i < d->tralloc; ++i) + { + free (d->trans[i]); + free (d->fails[i]); + } + + free (d->trans - 1); + free (d->fails); + free (d->newlines); + free (d->success); + } + + if (d->superset) + dfafree (d->superset); +} + +/* Having found the postfix representation of the regular expression, + try to find a long sequence of characters that must appear in any line + containing the r.e. + Finding a "longest" sequence is beyond the scope here; + we take an easy way out and hope for the best. + (Take "(ab|a)b"--please.) 
+ + We do a bottom-up calculation of sequences of characters that must appear + in matches of r.e.'s represented by trees rooted at the nodes of the postfix + representation: + sequences that must appear at the left of the match ("left") + sequences that must appear at the right of the match ("right") + lists of sequences that must appear somewhere in the match ("in") + sequences that must constitute the match ("is") + + When we get to the root of the tree, we use one of the longest of its + calculated "in" sequences as our answer. + + The sequences calculated for the various types of node (in pseudo ANSI c) + are shown below. "p" is the operand of unary operators (and the left-hand + operand of binary operators); "q" is the right-hand operand of binary + operators. + + "ZERO" means "a zero-length sequence" below. + + Type left right is in + ---- ---- ----- -- -- + char c # c # c # c # c + + ANYCHAR ZERO ZERO ZERO ZERO + + MBCSET ZERO ZERO ZERO ZERO + + CSET ZERO ZERO ZERO ZERO + + STAR ZERO ZERO ZERO ZERO + + QMARK ZERO ZERO ZERO ZERO + + PLUS p->left p->right ZERO p->in + + CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus + p->left : q->right : q->is!=ZERO) ? q->in plus + p->is##q->left p->right##q->is p->is##q->is : p->right##q->left + ZERO + + OR longest common longest common (do p->is and substrings common + leading trailing to q->is have same p->in and + (sub)sequence (sub)sequence q->in length and content) ? + of p->left of p->right + and q->left and q->right p->is : NULL + + If there's anything else we recognize in the tree, all four sequences get set + to zero-length sequences. If there's something we don't recognize in the + tree, we just return a zero-length sequence. + + Break ties in favor of infrequent letters (choosing 'zzz' in preference to + 'aaa')? + + And ... 
is it here or someplace that we might ponder "optimizations" such as + egrep 'psi|epsilon' -> egrep 'psi' + egrep 'pepsi|epsilon' -> egrep 'epsi' + (Yes, we now find "epsi" as a "string + that must occur", but we might also + simplify the *entire* r.e. being sought) + grep '[c]' -> grep 'c' + grep '(ab|a)b' -> grep 'ab' + grep 'ab*' -> grep 'a' + grep 'a*b' -> grep 'b' + + There are several issues: + + Is optimization easy (enough)? + + Does optimization actually accomplish anything, + or is the automaton you get from "psi|epsilon" (for example) + the same as the one you get from "psi" (for example)? + + Are optimizable r.e.'s likely to be used in real-life situations + (something like 'ab*' is probably unlikely; something like + 'psi|epsilon' is likelier)? */ +/* Append the string NEW to the heap-allocated string OLD. When NEW is + empty, OLD is returned as is; otherwise OLD is grown with xrealloc and + the (possibly relocated) result is returned, so callers must use the + return value in place of OLD. */ +static char * +icatalloc (char *old, char const *new) +{ + char *result; + size_t oldsize; + size_t newsize = strlen (new); + if (newsize == 0) + return old; + oldsize = strlen (old); + result = xrealloc (old, oldsize + newsize + 1); + memcpy (result + oldsize, new, newsize + 1); + return result; +} +/* Free every string in the NULL-terminated array CPP. The array itself + is left allocated and its slots are not cleared. */ +static void +freelist (char **cpp) +{ + while (*cpp) + free (*cpp++); +} +/* Add a NUL-terminated copy of the first LEN bytes of NEW to the + NULL-terminated list CPP. If an existing entry already contains the + copy, the copy is discarded and the list is returned unchanged; + otherwise every entry that is a substring of the copy is freed and + dropped before the copy is appended. Return the list, which may have + been reallocated. */ +static char ** +enlist (char **cpp, char *new, size_t len) +{ + size_t i, j; + new = memcpy (xmalloc (len + 1), new, len); + new[len] = '\0'; + /* Is there already something in the list that's new (or longer)? */ + for (i = 0; cpp[i] != NULL; ++i) + if (strstr (cpp[i], new) != NULL) + { + free (new); + return cpp; + } + /* Eliminate any obsoleted strings. */ + j = 0; + while (cpp[j] != NULL) + if (strstr (new, cpp[j]) == NULL) + ++j; + else + { + free (cpp[j]); + if (--i == j) + break; + cpp[j] = cpp[i]; + cpp[i] = NULL; + } + /* Add the new string. */ + cpp = xnrealloc (cpp, i + 2, sizeof *cpp); + cpp[i] = new; + cpp[i + 1] = NULL; + return cpp; +} + +/* Given pointers to two strings, return a pointer to an allocated + list of their distinct common substrings.
*/ +static char ** +comsubs (char *left, char const *right) +{ + char **cpp = xzalloc (sizeof *cpp); + char *lcp; + + for (lcp = left; *lcp != '\0'; ++lcp) + { + size_t len = 0; + char *rcp = strchr (right, *lcp); + while (rcp != NULL) + { + size_t i; + for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i) + continue; + if (i > len) + len = i; + rcp = strchr (rcp + 1, *lcp); + } + if (len != 0) + cpp = enlist (cpp, lcp, len); + } + return cpp; +} + +static char ** +addlists (char **old, char **new) +{ + for (; *new; new++) + old = enlist (old, *new, strlen (*new)); + return old; +} + +/* Given two lists of substrings, return a new list giving substrings + common to both. */ +static char ** +inboth (char **left, char **right) +{ + char **both = xzalloc (sizeof *both); + size_t lnum, rnum; + + for (lnum = 0; left[lnum] != NULL; ++lnum) + { + for (rnum = 0; right[rnum] != NULL; ++rnum) + { + char **temp = comsubs (left[lnum], right[rnum]); + both = addlists (both, temp); + freelist (temp); + free (temp); + } + } + return both; +} + +typedef struct must must; + +struct must +{ + char **in; + char *left; + char *right; + char *is; + bool begline; + bool endline; + must *prev; +}; + +static must * +allocmust (must *mp, size_t size) +{ + must *new_mp = xmalloc (sizeof *new_mp); + new_mp->in = xzalloc (sizeof *new_mp->in); + new_mp->left = xzalloc (size); + new_mp->right = xzalloc (size); + new_mp->is = xzalloc (size); + new_mp->begline = false; + new_mp->endline = false; + new_mp->prev = mp; + return new_mp; +} + +static void +resetmust (must *mp) +{ + freelist (mp->in); + mp->in[0] = NULL; + mp->left[0] = mp->right[0] = mp->is[0] = '\0'; + mp->begline = false; + mp->endline = false; +} + +static void +freemust (must *mp) +{ + freelist (mp->in); + free (mp->in); + free (mp->left); + free (mp->right); + free (mp->is); + free (mp); +} + +struct dfamust * +dfamust (struct dfa const *d) +{ + must *mp = NULL; + char const *result = ""; + size_t i; + bool exact = false; + bool 
begline = false; + bool endline = false; + bool need_begline = false; + bool need_endline = false; + bool case_fold_unibyte = case_fold && MB_CUR_MAX == 1; + + for (size_t ri = 0; ri < d->tindex; ++ri) + { + token t = d->tokens[ri]; + switch (t) + { + case BEGLINE: + mp = allocmust (mp, 2); + mp->begline = true; + need_begline = true; + break; + case ENDLINE: + mp = allocmust (mp, 2); + mp->endline = true; + need_endline = true; + break; + case LPAREN: + case RPAREN: + assert (!"neither LPAREN nor RPAREN may appear here"); + + case EMPTY: + case BEGWORD: + case ENDWORD: + case LIMWORD: + case NOTLIMWORD: + case BACKREF: + case ANYCHAR: + case MBCSET: + mp = allocmust (mp, 2); + break; + + case STAR: + case QMARK: + resetmust (mp); + break; + + case OR: + { + char **new; + must *rmp = mp; + must *lmp = mp = mp->prev; + size_t j, ln, rn, n; + + /* Guaranteed to be. Unlikely, but ... */ + if (STREQ (lmp->is, rmp->is)) + { + lmp->begline &= rmp->begline; + lmp->endline &= rmp->endline; + } + else + { + lmp->is[0] = '\0'; + lmp->begline = false; + lmp->endline = false; + } + /* Left side--easy */ + i = 0; + while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i]) + ++i; + lmp->left[i] = '\0'; + /* Right side */ + ln = strlen (lmp->right); + rn = strlen (rmp->right); + n = ln; + if (n > rn) + n = rn; + for (i = 0; i < n; ++i) + if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1]) + break; + for (j = 0; j < i; ++j) + lmp->right[j] = lmp->right[(ln - i) + j]; + lmp->right[j] = '\0'; + new = inboth (lmp->in, rmp->in); + freelist (lmp->in); + free (lmp->in); + lmp->in = new; + freemust (rmp); + } + break; + + case PLUS: + mp->is[0] = '\0'; + break; + + case END: + assert (!mp->prev); + for (i = 0; mp->in[i] != NULL; ++i) + if (strlen (mp->in[i]) > strlen (result)) + result = mp->in[i]; + if (STREQ (result, mp->is)) + { + if ((!need_begline || mp->begline) && (!need_endline + || mp->endline)) + exact = true; + begline = mp->begline; + endline = mp->endline; + } + goto 
done; + + case CAT: + { + must *rmp = mp; + must *lmp = mp = mp->prev; + + /* In. Everything in left, plus everything in + right, plus concatenation of + left's right and right's left. */ + lmp->in = addlists (lmp->in, rmp->in); + if (lmp->right[0] != '\0' && rmp->left[0] != '\0') + { + size_t lrlen = strlen (lmp->right); + size_t rllen = strlen (rmp->left); + char *tp = xmalloc (lrlen + rllen); + memcpy (tp, lmp->right, lrlen); + memcpy (tp + lrlen, rmp->left, rllen); + lmp->in = enlist (lmp->in, tp, lrlen + rllen); + free (tp); + } + /* Left-hand */ + if (lmp->is[0] != '\0') + lmp->left = icatalloc (lmp->left, rmp->left); + /* Right-hand */ + if (rmp->is[0] == '\0') + lmp->right[0] = '\0'; + lmp->right = icatalloc (lmp->right, rmp->right); + /* Guaranteed to be */ + if ((lmp->is[0] != '\0' || lmp->begline) + && (rmp->is[0] != '\0' || rmp->endline)) + { + lmp->is = icatalloc (lmp->is, rmp->is); + lmp->endline = rmp->endline; + } + else + { + lmp->is[0] = '\0'; + lmp->begline = false; + lmp->endline = false; + } + freemust (rmp); + } + break; + + case '\0': + /* Not on *my* shift. */ + goto done; + + default: + if (CSET <= t) + { + /* If T is a singleton, or if case-folding in a unibyte + locale and T's members all case-fold to the same char, + convert T to one of its members. Otherwise, do + nothing further with T. */ + charclass *ccl = &d->charclasses[t - CSET]; + int j; + for (j = 0; j < NOTCHAR; j++) + if (tstbit (j, *ccl)) + break; + if (! (j < NOTCHAR)) + { + mp = allocmust (mp, 2); + break; + } + t = j; + while (++j < NOTCHAR) + if (tstbit (j, *ccl) + && ! 
(case_fold_unibyte + && toupper (j) == toupper (t))) + break; + if (j < NOTCHAR) + { + mp = allocmust (mp, 2); + break; + } + } + + size_t rj = ri + 2; + if (d->tokens[ri + 1] == CAT) + { + for (; rj < d->tindex - 1; rj += 2) + { + if ((rj != ri && (d->tokens[rj] <= 0 + || NOTCHAR <= d->tokens[rj])) + || d->tokens[rj + 1] != CAT) + break; + } + } + mp = allocmust (mp, ((rj - ri) >> 1) + 1); + mp->is[0] = mp->left[0] = mp->right[0] + = case_fold_unibyte ? toupper (t) : t; + + for (i = 1; ri + 2 < rj; i++) + { + ri += 2; + t = d->tokens[ri]; + mp->is[i] = mp->left[i] = mp->right[i] + = case_fold_unibyte ? toupper (t) : t; + } + mp->is[i] = mp->left[i] = mp->right[i] = '\0'; + mp->in = enlist (mp->in, mp->is, i); + break; + } + } + done:; + + struct dfamust *dm = NULL; + if (*result) + { + dm = xmalloc (sizeof *dm); + dm->exact = exact; + dm->begline = begline; + dm->endline = endline; + dm->must = xstrdup (result); + } + + while (mp) + { + must *prev = mp->prev; + freemust (mp); + mp = prev; + } + + return dm; +} + +void +dfamustfree (struct dfamust *dm) +{ + free (dm->must); + free (dm); +} + +struct dfa * +dfaalloc (void) +{ + return xmalloc (sizeof (struct dfa)); +} + +/* vim:set shiftwidth=2: */ diff --git a/src/dfa.h b/src/dfa.h new file mode 100644 index 0000000..fb9ac9a --- /dev/null +++ b/src/dfa.h @@ -0,0 +1,119 @@ +/* dfa.h - declarations for GNU deterministic regexp compiler + Copyright (C) 1988, 1998, 2007, 2009-2016 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., + 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */ + +/* Written June, 1988 by Mike Haertel */ + +#include +#include +#include + +#include "xalloc.h" /* for _GL_ATTRIBUTE_MALLOC */ + +/* Element of a list of strings, at least one of which is known to + appear in any R.E. matching the DFA. */ +struct dfamust +{ + bool exact; + bool begline; + bool endline; + char *must; +}; + +/* The dfa structure. It is completely opaque. */ +struct dfa; + +/* Entry points. */ + +/* Allocate a struct dfa. The struct dfa is completely opaque. + The returned pointer should be passed directly to free() after + calling dfafree() on it. */ +extern struct dfa *dfaalloc (void) _GL_ATTRIBUTE_MALLOC; + +/* Build and return the struct dfamust from the given struct dfa. + Return NULL if no nonempty required string is found; otherwise the + result is heap-allocated and should be released with dfamustfree(). */ +extern struct dfamust *dfamust (struct dfa const *); + +/* Free a struct dfamust returned by dfamust(): both the string it + holds and the struct itself. The argument must not be NULL. */ +extern void dfamustfree (struct dfamust *); + +/* dfasyntax() takes three arguments; the first sets the regex syntax + bits (the same reg_syntax_t value that is given to re_set_syntax), + the second sets the case-folding flag, and the third specifies the + line terminator. */ +extern void dfasyntax (reg_syntax_t, int, unsigned char); + +/* Compile the given string of the given length into the given struct dfa. + Final argument is a flag specifying whether to build a searching or an + exact matcher. */ +extern void dfacomp (char const *, size_t, struct dfa *, int); + +/* Search through a buffer looking for a match to the given struct dfa. + Find the first occurrence of a string matching the regexp in the + buffer, and the shortest possible version thereof. Return a pointer to + the first character after the match, or NULL if none is found. BEGIN + points to the beginning of the buffer, and END points to the first byte + after its end.
Note however that we store a sentinel byte (usually + newline) in *END, so the actual buffer must be one byte longer. + When NEWLINE is nonzero, newlines may appear in the matching string. + If COUNT is non-NULL, increment *COUNT once for each newline processed. + Finally, if BACKREF is non-NULL set *BACKREF to indicate whether we + encountered a back-reference (1) or not (0). The caller may use this + to decide whether to fall back on a backtracking matcher. */ +extern char *dfaexec (struct dfa *d, char const *begin, char *end, + int newline, size_t *count, int *backref); + +/* Return a superset for D. The superset matches everything that D + matches, along with some other strings (though the latter should be + rare, for efficiency reasons). Return a null pointer if no useful + superset is available. */ +extern struct dfa *dfasuperset (struct dfa const *d) _GL_ATTRIBUTE_PURE; + +/* The DFA is likely to be fast. */ +extern bool dfaisfast (struct dfa const *) _GL_ATTRIBUTE_PURE; + +/* Free the storage held by the components of a struct dfa. */ +extern void dfafree (struct dfa *); + +/* Entry points for people who know what they're doing. */ + +/* Initialize the components of a struct dfa. */ +extern void dfainit (struct dfa *); + +/* Incrementally parse a string of given length into a struct dfa. */ +extern void dfaparse (char const *, size_t, struct dfa *); + +/* Analyze a parsed regexp; second argument tells whether to build a searching + or an exact matcher. */ +extern void dfaanalyze (struct dfa *, int); + +/* Compute, for each possible character, the transitions out of a given + state, storing them in an array of integers. */ +extern void dfastate (ptrdiff_t, struct dfa *, ptrdiff_t []); + +/* Error handling. */ + +/* dfawarn() is called by the regexp routines whenever a regex is compiled + that likely doesn't do what the user wanted. It takes a single + argument, a NUL-terminated string describing the situation. The user + must supply a dfawarn. 
*/ +extern void dfawarn (const char *); + +/* dfaerror() is called by the regexp routines whenever an error occurs. It + takes a single argument, a NUL-terminated string describing the error. + The user must supply a dfaerror. */ +extern _Noreturn void dfaerror (const char *); + +extern int using_utf8 (void); diff --git a/src/dfasearch.c b/src/dfasearch.c new file mode 100644 index 0000000..d348d44 --- /dev/null +++ b/src/dfasearch.c @@ -0,0 +1,451 @@ +/* dfasearch.c - searching subroutines using dfa and regex for grep. + Copyright 1992, 1998, 2000, 2007, 2009-2016 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA + 02110-1301, USA. */ + +/* Written August 1992 by Mike Haertel. */ + +#include +#include "intprops.h" +#include "search.h" + +/* Whether -w considers WC to be a word constituent. */ +static bool +wordchar (wint_t wc) +{ + return wc == L'_' || iswalnum (wc); +} + +/* KWset compiled pattern. For Ecompile and Gcompile, we compile + a list of strings, at least one of which is known to occur in + any string matching the regexp. */ +static kwset_t kwset; + +/* DFA compiled regexp. */ +static struct dfa *dfa; + +/* The Regex compiled patterns. */ +static struct patterns +{ + /* Regex compiled regexp. 
*/ + struct re_pattern_buffer regexbuf; + struct re_registers regs; /* This is here on account of a BRAIN-DEAD + Q@#%!# library interface in regex.c. */ +} patterns0; + +static struct patterns *patterns; +static size_t pcount; + +/* Number of compiled fixed strings known to exactly match the regexp. + If kwsexec returns < kwset_exact_matches, then we don't need to + call the regexp matcher at all. */ +static size_t kwset_exact_matches; + +static bool begline; + +void +dfaerror (char const *mesg) +{ + error (EXIT_TROUBLE, 0, "%s", mesg); + + /* notreached */ + /* Tell static analyzers that this function does not return. */ + abort (); +} + +/* For now, the sole dfawarn-eliciting condition (use of a regexp + like '[:lower:]') is unequivocally an error, so treat it as such, + when possible. */ +void +dfawarn (char const *mesg) +{ + static enum { DW_NONE = 0, DW_POSIX, DW_GNU } mode; + if (mode == DW_NONE) + mode = (getenv ("POSIXLY_CORRECT") ? DW_POSIX : DW_GNU); + if (mode == DW_GNU) + dfaerror (mesg); +} + +/* If the DFA turns out to have some set of fixed strings one of + which must occur in the match, then we build a kwset matcher + to find those strings, and thus quickly filter out impossible + matches. */ +static void +kwsmusts (void) +{ + struct dfamust *dm = dfamust (dfa); + if (!dm) + return; + kwsinit (&kwset); + if (dm->exact) + { + /* Prepare a substring whose presence implies a match. + The kwset matcher will return the index of the matching + string that it chooses. 
*/ + ++kwset_exact_matches; + size_t old_len = strlen (dm->must); + size_t new_len = old_len + dm->begline + dm->endline; + char *must = xmalloc (new_len); + char *mp = must; + *mp = eolbyte; + mp += dm->begline; + begline |= dm->begline; + memcpy (mp, dm->must, old_len); + if (dm->endline) + mp[old_len] = eolbyte; + kwsincr (kwset, must, new_len); + free (must); + } + else + { + /* Otherwise, filtering with this substring should help reduce the + search space, but we'll still have to use the regexp matcher. */ + kwsincr (kwset, dm->must, strlen (dm->must)); + } + kwsprep (kwset); + dfamustfree (dm); +} + +void +GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits) +{ + size_t total = size; + char *motif; + + if (match_icase) + syntax_bits |= RE_ICASE; + re_set_syntax (syntax_bits); + dfasyntax (syntax_bits, match_icase, eolbyte); + + /* For GNU regex, pass the patterns separately to detect errors like + "[\nallo\n]\n", where the patterns are "[", "allo" and "]", and + this should be a syntax error. The same for backref, where the + backref should be local to each pattern. */ + char const *p = pattern; + do + { + size_t len; + char const *sep = memchr (p, '\n', total); + if (sep) + { + len = sep - p; + sep++; + total -= (len + 1); + } + else + { + len = total; + total = 0; + } + + patterns = xnrealloc (patterns, pcount + 1, sizeof *patterns); + patterns[pcount] = patterns0; + + char const *err = re_compile_pattern (p, len, + &(patterns[pcount].regexbuf)); + if (err) + error (EXIT_TROUBLE, 0, "%s", err); + pcount++; + p = sep; + } + while (p); + + /* In the match_words and match_lines cases, we use a different pattern + for the DFA matcher that will quickly throw out cases that won't work. + Then if DFA succeeds we do some hairy stuff using the regex matcher + to decide whether the match should really count. 
*/ + if (match_words || match_lines) + { + static char const line_beg_no_bk[] = "^("; + static char const line_end_no_bk[] = ")$"; + static char const word_beg_no_bk[] = "(^|[^[:alnum:]_])("; + static char const word_end_no_bk[] = ")([^[:alnum:]_]|$)"; + static char const line_beg_bk[] = "^\\("; + static char const line_end_bk[] = "\\)$"; + static char const word_beg_bk[] = "\\(^\\|[^[:alnum:]_]\\)\\("; + static char const word_end_bk[] = "\\)\\([^[:alnum:]_]\\|$\\)"; + int bk = !(syntax_bits & RE_NO_BK_PARENS); + char *n = xmalloc (sizeof word_beg_bk - 1 + size + sizeof word_end_bk); + + strcpy (n, match_lines ? (bk ? line_beg_bk : line_beg_no_bk) + : (bk ? word_beg_bk : word_beg_no_bk)); + total = strlen(n); + memcpy (n + total, pattern, size); + total += size; + strcpy (n + total, match_lines ? (bk ? line_end_bk : line_end_no_bk) + : (bk ? word_end_bk : word_end_no_bk)); + total += strlen (n + total); + pattern = motif = n; + size = total; + } + else + motif = NULL; + + dfa = dfaalloc (); + dfacomp (pattern, size, dfa, 1); + kwsmusts (); + + free(motif); +} + +size_t +EGexecute (char *buf, size_t size, size_t *match_size, + char const *start_ptr) +{ + char const *buflim, *beg, *end, *ptr, *match, *best_match, *mb_start; + char eol = eolbyte; + regoff_t start; + size_t len, best_len; + struct kwsmatch kwsm; + size_t i; + struct dfa *superset = dfasuperset (dfa); + bool dfafast = dfaisfast (dfa); + + mb_start = buf; + buflim = buf + size; + + for (beg = end = buf; end < buflim; beg = end) + { + end = buflim; + + if (!start_ptr) + { + char const *next_beg, *dfa_beg = beg; + size_t count = 0; + bool exact_kwset_match = false; + int backref = 0; + + /* Try matching with KWset, if it's defined. */ + if (kwset) + { + char const *prev_beg; + + /* Find a possible match using the KWset matcher. 
*/ + size_t offset = kwsexec (kwset, beg - begline, + buflim - beg + begline, &kwsm); + if (offset == (size_t) -1) + goto failure; + match = beg + offset; + prev_beg = beg; + + /* Narrow down to the line containing the possible match. */ + beg = memrchr (buf, eol, match - buf); + beg = beg ? beg + 1 : buf; + dfa_beg = beg; + + /* Determine the end pointer to give the DFA next. Typically + this is after the first newline after MATCH; but if the KWset + match is not exact, the DFA is fast, and the offset from + PREV_BEG is less than 64 or (MATCH - PREV_BEG), this is the + greater of the latter two values; this temporarily prefers + the DFA to KWset. */ + exact_kwset_match = kwsm.index < kwset_exact_matches; + end = ((exact_kwset_match || !dfafast + || MAX (16, match - beg) < (match - prev_beg) >> 2) + ? match + : MAX (16, match - beg) < (buflim - prev_beg) >> 2 + ? prev_beg + 4 * MAX (16, match - beg) + : buflim); + end = memchr (end, eol, buflim - end); + end = end ? end + 1 : buflim; + + if (exact_kwset_match) + { + if (MB_CUR_MAX == 1 || using_utf8 ()) + goto success; + if (mb_start < beg) + mb_start = beg; + if (mb_goback (&mb_start, match, buflim) == 0) + goto success; + /* The matched line starts in the middle of a multibyte + character. Perform the DFA search starting from the + beginning of the next character. */ + dfa_beg = mb_start; + } + } + + /* Try matching with the superset of DFA, if it's defined. */ + if (superset && !exact_kwset_match) + { + /* Keep using the superset while it reports multiline + potential matches; this is more likely to be fast + than falling back to KWset would be. */ + while ((next_beg = dfaexec (superset, dfa_beg, (char *) end, 1, + &count, NULL)) + && next_beg != end + && count != 0) + { + /* Try to match in just one line. */ + count = 0; + beg = memrchr (buf, eol, next_beg - buf); + beg++; + dfa_beg = beg; + } + if (next_beg == NULL || next_beg == end) + continue; + + /* Narrow down to the line we've found. 
*/ + end = memchr (next_beg, eol, buflim - next_beg); + end = end ? end + 1 : buflim; + } + + /* Try matching with DFA. */ + next_beg = dfaexec (dfa, dfa_beg, (char *) end, 0, &count, &backref); + + /* If there's no match, or if we've matched the sentinel, + we're done. */ + if (next_beg == NULL || next_beg == end) + continue; + + /* Narrow down to the line we've found. */ + if (count != 0) + { + beg = memrchr (buf, eol, next_beg - buf); + beg++; + } + end = memchr (next_beg, eol, buflim - next_beg); + end = end ? end + 1 : buflim; + + /* Successful, no backreferences encountered! */ + if (!backref) + goto success; + ptr = beg; + } + else + { + /* We are looking for the leftmost (then longest) exact match. + We will go through the outer loop only once. */ + ptr = start_ptr; + } + + /* If the "line" is longer than the maximum regexp offset, + die as if we've run out of memory. */ + if (TYPE_MAXIMUM (regoff_t) < end - beg - 1) + xalloc_die (); + + /* Run the possible match through Regex. */ + best_match = end; + best_len = 0; + for (i = 0; i < pcount; i++) + { + patterns[i].regexbuf.not_eol = 0; + patterns[i].regexbuf.newline_anchor = eolbyte == '\n'; + start = re_search (&(patterns[i].regexbuf), + beg, end - beg - 1, + ptr - beg, end - ptr - 1, + &(patterns[i].regs)); + if (start < -1) + xalloc_die (); + else if (0 <= start) + { + len = patterns[i].regs.end[0] - start; + match = beg + start; + if (match > best_match) + continue; + if (start_ptr && !match_words) + goto assess_pattern_match; + if ((!match_lines && !match_words) + || (match_lines && len == end - ptr - 1)) + { + match = ptr; + len = end - ptr; + goto assess_pattern_match; + } + /* If -w and not -x, check whether the match aligns with + word boundaries. Do this iteratively because: + (a) the line may contain more than one occurrence of the + pattern, and + (b) Several alternatives in the pattern might be valid at a + given point, and we may need to consider a shorter one to + find a word boundary. 
*/ + if (!match_lines && match_words) + while (match <= best_match) + { + regoff_t shorter_len = 0; + if (!wordchar (mb_prev_wc (beg, match, end - 1)) + && !wordchar (mb_next_wc (match + len, end - 1))) + goto assess_pattern_match; + if (len > 0) + { + /* Try a shorter length anchored at the same place. */ + --len; + patterns[i].regexbuf.not_eol = 1; + shorter_len = re_match (&(patterns[i].regexbuf), + beg, match + len - ptr, + match - beg, + &(patterns[i].regs)); + if (shorter_len < -1) + xalloc_die (); + } + if (0 < shorter_len) + len = shorter_len; + else + { + /* Try looking further on. */ + if (match == end - 1) + break; + match++; + patterns[i].regexbuf.not_eol = 0; + start = re_search (&(patterns[i].regexbuf), + beg, end - beg - 1, + match - beg, end - match - 1, + &(patterns[i].regs)); + if (start < 0) + { + if (start < -1) + xalloc_die (); + break; + } + len = patterns[i].regs.end[0] - start; + match = beg + start; + } + } /* while (match <= best_match) */ + continue; + assess_pattern_match: + if (!start_ptr) + { + /* Good enough for a non-exact match. + No need to look at further patterns, if any. */ + goto success; + } + if (match < best_match || (match == best_match && len > best_len)) + { + /* Best exact match: leftmost, then longest. */ + best_match = match; + best_len = len; + } + } /* if re_search >= 0 */ + } /* for Regex patterns. */ + if (best_match < end) + { + /* We have found an exact match. We were just + waiting for the best one (leftmost then longest). */ + beg = best_match; + len = best_len; + goto success_in_len; + } + } /* for (beg = end ..) */ + + failure: + return -1; + + success: + len = end - beg; + success_in_len:; + size_t off = beg - buf; + *match_size = len; + return off; +} diff --git a/src/dosbuf.c b/src/dosbuf.c new file mode 100644 index 0000000..839cc3a --- /dev/null +++ b/src/dosbuf.c @@ -0,0 +1,222 @@ +/* dosbuf.c + Copyright (C) 1992, 1997-2002, 2004-2016 Free Software Foundation, Inc. 
+ + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA + 02110-1301, USA. */ + +/* Messy DOS-specific code for correctly treating binary, Unix text + and DOS text files. + + This has several aspects: + + * Guessing the file type (unless the user tells us); + * Stripping CR characters from DOS text files (otherwise regex + functions won't work correctly); + * Reporting correct byte count with -b for any kind of file. + +*/ + +#include + +typedef enum { + UNKNOWN, DOS_BINARY, DOS_TEXT, UNIX_TEXT +} File_type; + +struct dos_map { + off_t pos; /* position in buffer passed to matcher */ + off_t add; /* how much to add when reporting char position */ +}; + +static int dos_report_unix_offset = 0; + +static File_type dos_file_type = UNKNOWN; +static File_type dos_use_file_type = UNKNOWN; +static off_t dos_stripped_crs = 0; +static struct dos_map *dos_pos_map; +static int dos_pos_map_size = 0; +static int dos_pos_map_used = 0; +static int inp_map_idx = 0, out_map_idx = 1; + +/* Set default DOS file type to binary. */ +static void +dos_binary (void) +{ + if (O_BINARY) + dos_use_file_type = DOS_BINARY; +} + +/* Tell DOS routines to report Unix offset. */ +static void +dos_unix_byte_offsets (void) +{ + if (O_BINARY) + dos_report_unix_offset = 1; +} + +/* Guess DOS file type by looking at its contents. 
*/ +static File_type +guess_type (char *buf, size_t buflen) +{ + int crlf_seen = 0; + char *bp = buf; + + while (buflen--) + { + /* Treat a file as binary if it has a NUL character. */ + if (!*bp) + return DOS_BINARY; + + /* CR before LF means DOS text file (unless we later see + binary characters). */ + else if (*bp == '\r' && buflen && bp[1] == '\n') + crlf_seen = 1; + + bp++; + } + + return crlf_seen ? DOS_TEXT : UNIX_TEXT; +} + +/* Convert external DOS file representation to internal. + Return the count of bytes left in the buffer. + Build table to map character positions when reporting byte counts. */ +static size_t +undossify_input (char *buf, size_t buflen) +{ + if (! O_BINARY) + return buflen; + + size_t bytes_left = 0; + + if (totalcc == 0) + { + /* New file: forget everything we knew about character + position mapping table and file type. */ + inp_map_idx = 0; + out_map_idx = 1; + dos_pos_map_used = 0; + dos_stripped_crs = 0; + dos_file_type = dos_use_file_type; + } + + /* Guess if this file is binary, unless we already know that. */ + if (dos_file_type == UNKNOWN) + dos_file_type = guess_type(buf, buflen); + + /* If this file is to be treated as DOS Text, strip the CR characters + and maybe build the table for character position mapping on output. */ + if (dos_file_type == DOS_TEXT) + { + char *destp = buf; + + while (buflen--) + { + if (*buf != '\r') + { + *destp++ = *buf++; + bytes_left++; + } + else + { + buf++; + if (out_byte && !dos_report_unix_offset) + { + dos_stripped_crs++; + while (buflen && *buf == '\r') + { + dos_stripped_crs++; + buflen--; + buf++; + } + if (inp_map_idx >= dos_pos_map_size - 1) + { + dos_pos_map_size = inp_map_idx ? inp_map_idx * 2 : 1000; + dos_pos_map = xrealloc(dos_pos_map, + dos_pos_map_size * + sizeof(struct dos_map)); + } + + if (!inp_map_idx) + { + /* Add sentinel entry. */ + dos_pos_map[inp_map_idx].pos = 0; + dos_pos_map[inp_map_idx++].add = 0; + + /* Initialize first real entry. 
*/ + dos_pos_map[inp_map_idx].add = 0; + } + + /* Put the new entry. If the stripped CR characters + precede a Newline (the usual case), pretend that + they were found *after* the Newline. This makes + displayed byte offsets more reasonable in some + cases, and fits better the intuitive notion that + the line ends *before* the CR, not *after* it. */ + inp_map_idx++; + dos_pos_map[inp_map_idx-1].pos = + (*buf == '\n' ? destp + 1 : destp ) - bufbeg + totalcc; + dos_pos_map[inp_map_idx].add = dos_stripped_crs; + dos_pos_map_used = inp_map_idx; + + /* The following will be updated on the next pass. */ + dos_pos_map[inp_map_idx].pos = destp - bufbeg + totalcc + 1; + } + } + } + + return bytes_left; + } + + return buflen; +} + +/* Convert internal byte count into external. */ +static off_t +dossified_pos (off_t byteno) +{ + if (! O_BINARY) + return byteno; + + off_t pos_lo; + off_t pos_hi; + + if (dos_file_type != DOS_TEXT || dos_report_unix_offset) + return byteno; + + /* Optimization: usually the file will be scanned sequentially. + So in most cases, this byte position will be found in the + table near the previous one, as recorded in 'out_map_idx'. */ + pos_lo = dos_pos_map[out_map_idx-1].pos; + pos_hi = dos_pos_map[out_map_idx].pos; + + /* If the initial guess failed, search up or down, as + appropriate, beginning with the previous place. 
*/ + if (byteno >= pos_hi) + { + out_map_idx++; + while (out_map_idx < dos_pos_map_used + && byteno >= dos_pos_map[out_map_idx].pos) + out_map_idx++; + } + + else if (byteno < pos_lo) + { + out_map_idx--; + while (out_map_idx > 1 && byteno < dos_pos_map[out_map_idx-1].pos) + out_map_idx--; + } + + return byteno + dos_pos_map[out_map_idx].add; +} diff --git a/src/egrep.sh b/src/egrep.sh new file mode 100644 index 0000000..6d6c15a --- /dev/null +++ b/src/egrep.sh @@ -0,0 +1,2 @@ +#!@SHELL@ +exec @grep@ @option@ "$@" diff --git a/src/grep.c b/src/grep.c new file mode 100644 index 0000000..8baca5a --- /dev/null +++ b/src/grep.c @@ -0,0 +1,2720 @@ +/* grep.c - main driver file for grep. + Copyright (C) 1992, 1997-2002, 2004-2016 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA + 02110-1301, USA. */ + +/* Written July 1992 by Mike Haertel. 
*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "system.h" + +#include "argmatch.h" +#include "c-ctype.h" +#include "closeout.h" +#include "colorize.h" +#include "error.h" +#include "exclude.h" +#include "exitfail.h" +#include "fcntl-safer.h" +#include "fts_.h" +#include "getopt.h" +#include "grep.h" +#include "intprops.h" +#include "progname.h" +#include "propername.h" +#include "quote.h" +#include "safe-read.h" +#include "search.h" +#include "version-etc.h" +#include "xalloc.h" +#include "xstrtol.h" + +#define SEP_CHAR_SELECTED ':' +#define SEP_CHAR_REJECTED '-' +#define SEP_STR_GROUP "--" + +#define AUTHORS \ + proper_name ("Mike Haertel"), \ + _("others, see ") + +/* When stdout is connected to a regular file, save its stat + information here, so that we can automatically skip it, thus + avoiding a potential (racy) infinite loop. */ +static struct stat out_stat; + +/* if non-zero, display usage information and exit */ +static int show_help; + +/* Print the version on standard output and exit. */ +static bool show_version; + +/* Suppress diagnostics for nonexistent or unreadable files. */ +static bool suppress_errors; + +/* If nonzero, use color markers. */ +static int color_option; + +/* Show only the part of a line matching the expression. */ +static bool only_matching; + +/* If nonzero, make sure first content char in a line is on a tab stop. */ +static bool align_tabs; + +#if HAVE_ASAN +/* Record the starting address and length of the sole poisoned region, + so that we can unpoison it later, just before each following read. 
*/ +static void const *poison_buf; +static size_t poison_len; + +static void +clear_asan_poison (void) +{ + if (poison_buf) + __asan_unpoison_memory_region (poison_buf, poison_len); +} + +static void +asan_poison (void const *addr, size_t size) +{ + poison_buf = addr; + poison_len = size; + + __asan_poison_memory_region (poison_buf, poison_len); +} +#else +static void clear_asan_poison (void) { } +static void asan_poison (void const volatile *addr, size_t size) { } +#endif + +/* The group separator used when context is requested. */ +static const char *group_separator = SEP_STR_GROUP; + +/* The context and logic for choosing default --color screen attributes + (foreground and background colors, etc.) are the following. + -- There are eight basic colors available, each with its own + nominal luminosity to the human eye and foreground/background + codes (black [0 %, 30/40], blue [11 %, 34/44], red [30 %, 31/41], + magenta [41 %, 35/45], green [59 %, 32/42], cyan [70 %, 36/46], + yellow [89 %, 33/43], and white [100 %, 37/47]). + -- Sometimes, white as a background is actually implemented using + a shade of light gray, so that a foreground white can be visible + on top of it (but most often not). + -- Sometimes, black as a foreground is actually implemented using + a shade of dark gray, so that it can be visible on top of a + background black (but most often not). + -- Sometimes, more colors are available, as extensions. + -- Other attributes can be selected/deselected (bold [1/22], + underline [4/24], standout/inverse [7/27], blink [5/25], and + invisible/hidden [8/28]). They are sometimes implemented by + using colors instead of what their names imply; e.g., bold is + often achieved by using brighter colors. In practice, only bold + is really available to us, underline sometimes being mapped by + the terminal to some strange color choice, and standout best + being left for use by downstream programs such as less(1). 
+ -- We cannot assume that any of the extensions or special features + are available for the purpose of choosing defaults for everyone. + -- The most prevalent default terminal backgrounds are pure black + and pure white, and are not necessarily the same shades of + those as if they were selected explicitly with SGR sequences. + Some terminals use dark or light pictures as default background, + but those are covered over by an explicit selection of background + color with an SGR sequence; their users will appreciate their + background pictures not be covered like this, if possible. + -- Some uses of colors attributes is to make some output items + more understated (e.g., context lines); this cannot be achieved + by changing the background color. + -- For these reasons, the grep color defaults should strive not + to change the background color from its default, unless it's + for a short item that should be highlighted, not understated. + -- The grep foreground color defaults (without an explicitly set + background) should provide enough contrast to be readable on any + terminal with either a black (dark) or white (light) background. + This only leaves red, magenta, green, and cyan (and their bold + counterparts) and possibly bold blue. */ +/* The color strings used for matched text. + The user can overwrite them using the deprecated + environment variable GREP_COLOR or the new GREP_COLORS. */ +static const char *selected_match_color = "01;31"; /* bold red */ +static const char *context_match_color = "01;31"; /* bold red */ + +/* Other colors. Defaults look damn good. 
*/ +static const char *filename_color = "35"; /* magenta */ +static const char *line_num_color = "32"; /* green */ +static const char *byte_num_color = "32"; /* green */ +static const char *sep_color = "36"; /* cyan */ +static const char *selected_line_color = ""; /* default color pair */ +static const char *context_line_color = ""; /* default color pair */ + +/* Select Graphic Rendition (SGR, "\33[...m") strings. */ +/* Also Erase in Line (EL) to Right ("\33[K") by default. */ +/* Why have EL to Right after SGR? + -- The behavior of line-wrapping when at the bottom of the + terminal screen and at the end of the current line is often + such that a new line is introduced, entirely cleared with + the current background color which may be different from the + default one (see the boolean back_color_erase terminfo(5) + capability), thus scrolling the display by one line. + The end of this new line will stay in this background color + even after reverting to the default background color with + "\33[m', unless it is explicitly cleared again with "\33[K" + (which is the behavior the user would instinctively expect + from the whole thing). There may be some unavoidable + background-color flicker at the end of this new line because + of this (when timing with the monitor's redraw is just right). + -- The behavior of HT (tab, "\t") is usually the same as that of + Cursor Forward Tabulation (CHT) with a default parameter + of 1 ("\33[I"), i.e., it performs pure movement to the next + tab stop, without any clearing of either content or screen + attributes (including background color); try + printf 'asdfqwerzxcv\rASDF\tZXCV\n' + in a bash(1) shell to demonstrate this. This is not what the + user would instinctively expect of HT (but is ok for CHT). 
+ The instinctive behavior would include clearing the terminal + cells that are skipped over by HT with blank cells in the + current screen attributes, including background color; + the boolean dest_tabs_magic_smso terminfo(5) capability + indicates this saner behavior for HT, but only some rare + terminals have it (although it also indicates a special + glitch with standout mode in the Teleray terminal for which + it was initially introduced). The remedy is to add "\33K" + after each SGR sequence, be it START (to fix the behavior + of any HT after that before another SGR) or END (to fix the + behavior of an HT in default background color that would + follow a line-wrapping at the bottom of the screen in another + background color, and to complement doing it after START). + Piping grep's output through a pager such as less(1) avoids + any HT problems since the pager performs tab expansion. + + Generic disadvantages of this remedy are: + -- Some very rare terminals might support SGR but not EL (nobody + will use "grep --color" on a terminal that does not support + SGR in the first place). + -- Having these extra control sequences might somewhat complicate + the task of any program trying to parse "grep --color" + output in order to extract structuring information from it. + A specific disadvantage to doing it after SGR START is: + -- Even more possible background color flicker (when timing + with the monitor's redraw is just right), even when not at the + bottom of the screen. + There are no additional disadvantages specific to doing it after + SGR END. + + It would be impractical for GNU grep to become a full-fledged + terminal program linked against ncurses or the like, so it will + not detect terminfo(5) capabilities. */ +static const char *sgr_start = "\33[%sm\33[K"; +static const char *sgr_end = "\33[m\33[K"; + +/* SGR utility functions. 
*/ +static void +pr_sgr_start (char const *s) +{ + if (*s) + print_start_colorize (sgr_start, s); +} +static void +pr_sgr_end (char const *s) +{ + if (*s) + print_end_colorize (sgr_end); +} +static void +pr_sgr_start_if (char const *s) +{ + if (color_option) + pr_sgr_start (s); +} +static void +pr_sgr_end_if (char const *s) +{ + if (color_option) + pr_sgr_end (s); +} + +struct color_cap + { + const char *name; + const char **var; + void (*fct) (void); + }; + +static void +color_cap_mt_fct (void) +{ + /* Our caller just set selected_match_color. */ + context_match_color = selected_match_color; +} + +static void +color_cap_rv_fct (void) +{ + /* By this point, it was 1 (or already -1). */ + color_option = -1; /* That's still != 0. */ +} + +static void +color_cap_ne_fct (void) +{ + sgr_start = "\33[%sm"; + sgr_end = "\33[m"; +} + +/* For GREP_COLORS. */ +static const struct color_cap color_dict[] = + { + { "mt", &selected_match_color, color_cap_mt_fct }, /* both ms/mc */ + { "ms", &selected_match_color, NULL }, /* selected matched text */ + { "mc", &context_match_color, NULL }, /* context matched text */ + { "fn", &filename_color, NULL }, /* filename */ + { "ln", &line_num_color, NULL }, /* line number */ + { "bn", &byte_num_color, NULL }, /* byte (sic) offset */ + { "se", &sep_color, NULL }, /* separator */ + { "sl", &selected_line_color, NULL }, /* selected lines */ + { "cx", &context_line_color, NULL }, /* context lines */ + { "rv", NULL, color_cap_rv_fct }, /* -v reverses sl/cx */ + { "ne", NULL, color_cap_ne_fct }, /* no EL on SGR_* */ + { NULL, NULL, NULL } + }; + +/* Saved errno value from failed output functions on stdout. */ +static int stdout_errno; + +static void +putchar_errno (int c) +{ + if (putchar (c) < 0) + stdout_errno = errno; +} + +static void +fputs_errno (char const *s) +{ + if (fputs (s, stdout) < 0) + stdout_errno = errno; +} + +static void _GL_ATTRIBUTE_FORMAT_PRINTF (1, 2) +printf_errno (char const *format, ...) 
+{ + va_list ap; + va_start (ap, format); + if (vfprintf (stdout, format, ap) < 0) + stdout_errno = errno; + va_end (ap); +} + +static void +fwrite_errno (void const *ptr, size_t size, size_t nmemb) +{ + if (fwrite (ptr, size, nmemb, stdout) != nmemb) + stdout_errno = errno; +} + +static void +fflush_errno (void) +{ + if (fflush (stdout) != 0) + stdout_errno = errno; +} + +static struct exclude *excluded_patterns[2]; +static struct exclude *excluded_directory_patterns[2]; +/* Short options. */ +static char const short_options[] = +"0123456789A:B:C:D:EFGHIPTUVX:abcd:e:f:hiLlm:noqRrsuvwxyZz"; + +/* Non-boolean long options that have no corresponding short equivalents. */ +enum +{ + BINARY_FILES_OPTION = CHAR_MAX + 1, + COLOR_OPTION, + EXCLUDE_DIRECTORY_OPTION, + EXCLUDE_OPTION, + EXCLUDE_FROM_OPTION, + GROUP_SEPARATOR_OPTION, + INCLUDE_OPTION, + LINE_BUFFERED_OPTION, + LABEL_OPTION +}; + +/* Long options equivalences. */ +static struct option const long_options[] = +{ + {"basic-regexp", no_argument, NULL, 'G'}, + {"extended-regexp", no_argument, NULL, 'E'}, + {"fixed-regexp", no_argument, NULL, 'F'}, + {"fixed-strings", no_argument, NULL, 'F'}, + {"perl-regexp", no_argument, NULL, 'P'}, + {"after-context", required_argument, NULL, 'A'}, + {"before-context", required_argument, NULL, 'B'}, + {"binary-files", required_argument, NULL, BINARY_FILES_OPTION}, + {"byte-offset", no_argument, NULL, 'b'}, + {"context", required_argument, NULL, 'C'}, + {"color", optional_argument, NULL, COLOR_OPTION}, + {"colour", optional_argument, NULL, COLOR_OPTION}, + {"count", no_argument, NULL, 'c'}, + {"devices", required_argument, NULL, 'D'}, + {"directories", required_argument, NULL, 'd'}, + {"exclude", required_argument, NULL, EXCLUDE_OPTION}, + {"exclude-from", required_argument, NULL, EXCLUDE_FROM_OPTION}, + {"exclude-dir", required_argument, NULL, EXCLUDE_DIRECTORY_OPTION}, + {"file", required_argument, NULL, 'f'}, + {"files-with-matches", no_argument, NULL, 'l'}, + 
{"files-without-match", no_argument, NULL, 'L'}, + {"group-separator", required_argument, NULL, GROUP_SEPARATOR_OPTION}, + {"help", no_argument, &show_help, 1}, + {"include", required_argument, NULL, INCLUDE_OPTION}, + {"ignore-case", no_argument, NULL, 'i'}, + {"initial-tab", no_argument, NULL, 'T'}, + {"label", required_argument, NULL, LABEL_OPTION}, + {"line-buffered", no_argument, NULL, LINE_BUFFERED_OPTION}, + {"line-number", no_argument, NULL, 'n'}, + {"line-regexp", no_argument, NULL, 'x'}, + {"max-count", required_argument, NULL, 'm'}, + + {"no-filename", no_argument, NULL, 'h'}, + {"no-group-separator", no_argument, NULL, GROUP_SEPARATOR_OPTION}, + {"no-messages", no_argument, NULL, 's'}, + {"null", no_argument, NULL, 'Z'}, + {"null-data", no_argument, NULL, 'z'}, + {"only-matching", no_argument, NULL, 'o'}, + {"quiet", no_argument, NULL, 'q'}, + {"recursive", no_argument, NULL, 'r'}, + {"dereference-recursive", no_argument, NULL, 'R'}, + {"regexp", required_argument, NULL, 'e'}, + {"invert-match", no_argument, NULL, 'v'}, + {"silent", no_argument, NULL, 'q'}, + {"text", no_argument, NULL, 'a'}, + {"binary", no_argument, NULL, 'U'}, + {"unix-byte-offsets", no_argument, NULL, 'u'}, + {"version", no_argument, NULL, 'V'}, + {"with-filename", no_argument, NULL, 'H'}, + {"word-regexp", no_argument, NULL, 'w'}, + {0, 0, 0, 0} +}; + +/* Define flags declared in grep.h. */ +bool match_icase; +bool match_words; +bool match_lines; +char eolbyte; + +static char const *matcher; + +/* For error messages. */ +/* The input file name, or (if standard input) "-" or a --label argument. */ +static char const *filename; +/* Omit leading "./" from file names in diagnostics. */ +static bool omit_dot_slash; +static bool errseen; + +/* True if output from the current input file has been suppressed + because an output line had an encoding error. 
*/ +static bool encoding_error_output; + +enum directories_type + { + READ_DIRECTORIES = 2, + RECURSE_DIRECTORIES, + SKIP_DIRECTORIES + }; + +/* How to handle directories. */ +static char const *const directories_args[] = +{ + "read", "recurse", "skip", NULL +}; +static enum directories_type const directories_types[] = +{ + READ_DIRECTORIES, RECURSE_DIRECTORIES, SKIP_DIRECTORIES +}; +ARGMATCH_VERIFY (directories_args, directories_types); + +static enum directories_type directories = READ_DIRECTORIES; + +enum { basic_fts_options = FTS_CWDFD | FTS_NOSTAT | FTS_TIGHT_CYCLE_CHECK }; +static int fts_options = basic_fts_options | FTS_COMFOLLOW | FTS_PHYSICAL; + +/* How to handle devices. */ +static enum + { + READ_COMMAND_LINE_DEVICES, + READ_DEVICES, + SKIP_DEVICES + } devices = READ_COMMAND_LINE_DEVICES; + +static bool grepfile (int, char const *, bool, bool); +static bool grepdesc (int, bool); + +static void dos_binary (void); +static void dos_unix_byte_offsets (void); +static size_t undossify_input (char *, size_t); + +static bool +is_device_mode (mode_t m) +{ + return S_ISCHR (m) || S_ISBLK (m) || S_ISSOCK (m) || S_ISFIFO (m); +} + +static bool +skip_devices (bool command_line) +{ + return (devices == SKIP_DEVICES + || (devices == READ_COMMAND_LINE_DEVICES && !command_line)); +} + +/* Return if ST->st_size is defined. Assume the file is not a + symbolic link. */ +static bool +usable_st_size (struct stat const *st) +{ + return S_ISREG (st->st_mode) || S_TYPEISSHM (st) || S_TYPEISTMO (st); +} + +/* Lame substitutes for SEEK_DATA and SEEK_HOLE on platforms lacking them. + Do not rely on these finding data or holes if they equal SEEK_SET. */ +#ifndef SEEK_DATA +enum { SEEK_DATA = SEEK_SET }; +#endif +#ifndef SEEK_HOLE +enum { SEEK_HOLE = SEEK_SET }; +#endif + +/* Functions we'll use to search. 
*/ +typedef void (*compile_fp_t) (char const *, size_t); +typedef size_t (*execute_fp_t) (char *, size_t, size_t *, char const *); +static compile_fp_t compile; +static execute_fp_t execute; + +/* Like error, but suppress the diagnostic if requested. */ +static void +suppressible_error (char const *mesg, int errnum) +{ + if (! suppress_errors) + error (0, errnum, "%s", mesg); + errseen = true; +} + +/* If there has already been a write error, don't bother closing + standard output, as that might elicit a duplicate diagnostic. */ +static void +clean_up_stdout (void) +{ + if (! stdout_errno) + close_stdout (); +} + +/* A cast to TYPE of VAL. Use this when TYPE is a pointer type, VAL + is properly aligned for TYPE, and 'gcc -Wcast-align' cannot infer + the alignment and would otherwise complain about the cast. */ +#if 4 < __GNUC__ + (6 <= __GNUC_MINOR__) +# define CAST_ALIGNED(type, val) \ + ({ __typeof__ (val) val_ = val; \ + _Pragma ("GCC diagnostic push") \ + _Pragma ("GCC diagnostic ignored \"-Wcast-align\"") \ + (type) val_; \ + _Pragma ("GCC diagnostic pop") \ + }) +#else +# define CAST_ALIGNED(type, val) ((type) (val)) +#endif + +/* An unsigned type suitable for fast matching. */ +typedef uintmax_t uword; + +/* A mask to test for unibyte characters, with the pattern repeated to + fill a uword. For a multibyte character encoding where + all bytes are unibyte characters, this is 0. For UTF-8, this is + 0x808080.... For encodings where unibyte characters have no discerned + pattern, this is all 1s. The unsigned char C is a unibyte + character if C & UNIBYTE_MASK is zero. If the uword W is the + concatenation of bytes, the bytes are all unibyte characters + if W & UNIBYTE_MASK is zero. */ +static uword unibyte_mask; + +static void +initialize_unibyte_mask (void) +{ + /* For each encoding error I that MASK does not already match, + accumulate I's most significant 1 bit by ORing it into MASK. 
+ Although any 1 bit of I could be used, in practice high-order + bits work better. */ + unsigned char mask = 0; + int ms1b = 1; + for (int i = 1; i <= UCHAR_MAX; i++) + if (mbclen_cache[i] != 1 && ! (mask & i)) + { + while (ms1b * 2 <= i) + ms1b *= 2; + mask |= ms1b; + } + + /* Now MASK will detect any encoding-error byte, although it may + cry wolf and it may not be optimal. Build a uword-length mask by + repeating MASK. */ + uword uword_max = -1; + unibyte_mask = uword_max / UCHAR_MAX * mask; +} + +/* Skip the easy bytes in a buffer that is guaranteed to have a sentinel + that is not easy, and return a pointer to the first non-easy byte. + The easy bytes all have UNIBYTE_MASK off. */ +static char const * _GL_ATTRIBUTE_PURE +skip_easy_bytes (char const *buf) +{ + /* Search a byte at a time until the pointer is aligned, then a + uword at a time until a match is found, then a byte at a time to + identify the exact byte. The uword search may go slightly past + the buffer end, but that's benign. */ + char const *p; + uword const *s; + for (p = buf; (uintptr_t) p % sizeof (uword) != 0; p++) + if (to_uchar (*p) & unibyte_mask) + return p; + for (s = CAST_ALIGNED (uword const *, p); ! (*s & unibyte_mask); s++) + continue; + for (p = (char const *) s; ! (to_uchar (*p) & unibyte_mask); p++) + continue; + return p; +} + +/* Return true if BUF, of size SIZE, has an encoding error. + BUF must be followed by at least sizeof (uword) bytes, + the first of which may be modified. */ +bool +buf_has_encoding_errors (char *buf, size_t size) +{ + if (! unibyte_mask) + return false; + + mbstate_t mbs = { 0 }; + size_t clen; + + buf[size] = -1; + for (char const *p = buf; (p = skip_easy_bytes (p)) < buf + size; p += clen) + { + clen = mbrlen (p, buf + size - p, &mbs); + if ((size_t) -2 <= clen) + return true; + } + + return false; +} + + +/* Return true if BUF, of size SIZE, has a null byte. 
+ BUF must be followed by at least one byte, + which may be arbitrarily written to or read from. */ +static bool +buf_has_nulls (char *buf, size_t size) +{ + buf[size] = 0; + return strlen (buf) != size; +} + +/* Return true if a file is known to contain null bytes. + SIZE bytes have already been read from the file + with descriptor FD and status ST. */ +static bool +file_must_have_nulls (size_t size, int fd, struct stat const *st) +{ + if (usable_st_size (st)) + { + if (st->st_size <= size) + return false; + + /* If the file has holes, it must contain a null byte somewhere. */ + if (SEEK_HOLE != SEEK_SET) + { + off_t cur = size; + if (O_BINARY || fd == STDIN_FILENO) + { + cur = lseek (fd, 0, SEEK_CUR); + if (cur < 0) + return false; + } + + /* Look for a hole after the current location. */ + off_t hole_start = lseek (fd, cur, SEEK_HOLE); + if (0 <= hole_start) + { + if (lseek (fd, cur, SEEK_SET) < 0) + suppressible_error (filename, errno); + if (hole_start < st->st_size) + return true; + } + } + } + + return false; +} + +/* Convert STR to a nonnegative integer, storing the result in *OUT. + STR must be a valid context length argument; report an error if it + isn't. Silently ceiling *OUT at the maximum value, as that is + practically equivalent to infinity for grep's purposes. */ +static void +context_length_arg (char const *str, intmax_t *out) +{ + switch (xstrtoimax (str, 0, 10, out, "")) + { + case LONGINT_OK: + case LONGINT_OVERFLOW: + if (0 <= *out) + break; + /* Fall through. */ + default: + error (EXIT_TROUBLE, 0, "%s: %s", str, + _("invalid context length argument")); + } +} + +/* Return the add_exclude options suitable for excluding a file name. + If COMMAND_LINE, it is a command-line file name. */ +static int +exclude_options (bool command_line) +{ + return EXCLUDE_WILDCARDS | (command_line ? 0 : EXCLUDE_ANCHORED); +} + +/* Return true if the file with NAME should be skipped. + If COMMAND_LINE, it is a command-line argument. 
+ If IS_DIR, it is a directory. */ +static bool +skipped_file (char const *name, bool command_line, bool is_dir) +{ + struct exclude **pats; + if (! is_dir) + pats = excluded_patterns; + else if (directories == SKIP_DIRECTORIES) + return true; + else if (command_line && omit_dot_slash) + return false; + else + pats = excluded_directory_patterns; + return pats[command_line] && excluded_file_name (pats[command_line], name); +} + +/* Hairy buffering mechanism for grep. The intent is to keep + all reads aligned on a page boundary and multiples of the + page size, unless a read yields a partial page. */ + +static char *buffer; /* Base of buffer. */ +static size_t bufalloc; /* Allocated buffer size, counting slop. */ +#define INITIAL_BUFSIZE 32768 /* Initial buffer size, not counting slop. */ +static int bufdesc; /* File descriptor. */ +static char *bufbeg; /* Beginning of user-visible stuff. */ +static char *buflim; /* Limit of user-visible stuff. */ +static size_t pagesize; /* alignment of memory pages */ +static off_t bufoffset; /* Read offset; defined on regular files. */ +static off_t after_last_match; /* Pointer after last matching line that + would have been output if we were + outputting characters. */ +static bool skip_nuls; /* Skip '\0' in data. */ +static bool skip_empty_lines; /* Skip empty lines in data. */ +static bool seek_data_failed; /* lseek with SEEK_DATA failed. */ +static uintmax_t totalnl; /* Total newline count before lastnl. */ + +/* Return VAL aligned to the next multiple of ALIGNMENT. VAL can be + an integer or a pointer. Both args must be free of side effects. */ +#define ALIGN_TO(val, alignment) \ + ((size_t) (val) % (alignment) == 0 \ + ? (val) \ + : (val) + ((alignment) - (size_t) (val) % (alignment))) + +/* Add two numbers that count input bytes or lines, and report an + error if the addition overflows. 
*/ +static uintmax_t +add_count (uintmax_t a, uintmax_t b) +{ + uintmax_t sum = a + b; + if (sum < a) + error (EXIT_TROUBLE, 0, _("input is too large to count")); + return sum; +} + +/* Return true if BUF (of size SIZE) is all zeros. */ +static bool +all_zeros (char const *buf, size_t size) +{ + for (char const *p = buf; p < buf + size; p++) + if (*p) + return false; + return true; +} + +/* Reset the buffer for a new file, returning false if we should skip it. + Initialize on the first time through. */ +static bool +reset (int fd, struct stat const *st) +{ + if (! pagesize) + { + pagesize = getpagesize (); + if (pagesize == 0 || 2 * pagesize + 1 <= pagesize) + abort (); + bufalloc = (ALIGN_TO (INITIAL_BUFSIZE, pagesize) + + pagesize + sizeof (uword)); + buffer = xmalloc (bufalloc); + } + + bufbeg = buflim = ALIGN_TO (buffer + 1, pagesize); + bufbeg[-1] = eolbyte; + bufdesc = fd; + + if (S_ISREG (st->st_mode)) + { + if (fd != STDIN_FILENO) + bufoffset = 0; + else + { + bufoffset = lseek (fd, 0, SEEK_CUR); + if (bufoffset < 0) + { + suppressible_error (_("lseek failed"), errno); + return false; + } + } + } + return true; +} + +/* Read new stuff into the buffer, saving the specified + amount of old stuff. When we're done, 'bufbeg' points + to the beginning of the buffer contents, and 'buflim' + points just after the end. Return false if there's an error. */ +static bool +fillbuf (size_t save, struct stat const *st) +{ + size_t fillsize; + bool cc = true; + char *readbuf; + size_t readsize; + + /* Offset from start of buffer to start of old stuff + that we want to save. */ + size_t saved_offset = buflim - save - buffer; + + if (pagesize <= buffer + bufalloc - sizeof (uword) - buflim) + { + readbuf = buflim; + bufbeg = buflim - save; + } + else + { + size_t minsize = save + pagesize; + size_t newsize; + size_t newalloc; + char *newbuf; + + /* Grow newsize until it is at least as great as minsize. 
*/ + for (newsize = bufalloc - pagesize - sizeof (uword); + newsize < minsize; + newsize *= 2) + if ((SIZE_MAX - pagesize - sizeof (uword)) / 2 < newsize) + xalloc_die (); + + /* Try not to allocate more memory than the file size indicates, + as that might cause unnecessary memory exhaustion if the file + is large. However, do not use the original file size as a + heuristic if we've already read past the file end, as most + likely the file is growing. */ + if (usable_st_size (st)) + { + off_t to_be_read = st->st_size - bufoffset; + off_t maxsize_off = save + to_be_read; + if (0 <= to_be_read && to_be_read <= maxsize_off + && maxsize_off == (size_t) maxsize_off + && minsize <= (size_t) maxsize_off + && (size_t) maxsize_off < newsize) + newsize = maxsize_off; + } + + /* Add enough room so that the buffer is aligned and has room + for byte sentinels fore and aft, and so that a uword can + be read aft. */ + newalloc = newsize + pagesize + sizeof (uword); + + newbuf = bufalloc < newalloc ? xmalloc (bufalloc = newalloc) : buffer; + readbuf = ALIGN_TO (newbuf + 1 + save, pagesize); + bufbeg = readbuf - save; + memmove (bufbeg, buffer + saved_offset, save); + bufbeg[-1] = eolbyte; + if (newbuf != buffer) + { + free (buffer); + buffer = newbuf; + } + } + + clear_asan_poison (); + + readsize = buffer + bufalloc - sizeof (uword) - readbuf; + readsize -= readsize % pagesize; + + while (true) + { + fillsize = safe_read (bufdesc, readbuf, readsize); + if (fillsize == SAFE_READ_ERROR) + { + fillsize = 0; + cc = false; + } + bufoffset += fillsize; + + if (fillsize == 0 || !skip_nuls || !all_zeros (readbuf, fillsize)) + break; + totalnl = add_count (totalnl, fillsize); + + if (SEEK_DATA != SEEK_SET && !seek_data_failed) + { + /* Solaris SEEK_DATA fails with errno == ENXIO in a hole at EOF. 
*/ + off_t data_start = lseek (bufdesc, bufoffset, SEEK_DATA); + if (data_start < 0 && errno == ENXIO + && usable_st_size (st) && bufoffset < st->st_size) + data_start = lseek (bufdesc, 0, SEEK_END); + + if (data_start < 0) + seek_data_failed = true; + else + { + totalnl = add_count (totalnl, data_start - bufoffset); + bufoffset = data_start; + } + } + } + + fillsize = undossify_input (readbuf, fillsize); + buflim = readbuf + fillsize; + + /* Initialize the following word, because skip_easy_bytes and some + matchers read (but do not use) those bytes. This avoids false + positive reports of these bytes being used uninitialized. */ + memset (buflim, 0, sizeof (uword)); + + /* Mark the part of the buffer not filled by the read or set by + the above memset call as ASAN-poisoned. */ + asan_poison (buflim + sizeof (uword), + bufalloc - (buflim - buffer) - sizeof (uword)); + + return cc; +} + +/* Flags controlling the style of output. */ +static enum +{ + BINARY_BINARY_FILES, + TEXT_BINARY_FILES, + WITHOUT_MATCH_BINARY_FILES +} binary_files; /* How to handle binary files. */ + +static int filename_mask; /* If zero, output nulls after filenames. */ +static bool out_quiet; /* Suppress all normal output. */ +static bool out_invert; /* Print nonmatching stuff. */ +static int out_file; /* Print filenames. */ +static bool out_line; /* Print line numbers. */ +static bool out_byte; /* Print byte offsets. */ +static intmax_t out_before; /* Lines of leading context. */ +static intmax_t out_after; /* Lines of trailing context. */ +static bool count_matches; /* Count matching lines. */ +static int list_files; /* List matching files. */ +static bool no_filenames; /* Suppress file names. */ +static intmax_t max_count; /* Stop after outputting this many + lines from an input file. */ +static bool line_buffered; /* Use line buffering. */ +static char *label = NULL; /* Fake filename for stdin */ + + +/* Internal variables to keep track of byte count, context, etc. 
*/ +static uintmax_t totalcc; /* Total character count before bufbeg. */ +static char const *lastnl; /* Pointer after last newline counted. */ +static char *lastout; /* Pointer after last character output; + NULL if no character has been output + or if it's conceptually before bufbeg. */ +static intmax_t outleft; /* Maximum number of lines to be output. */ +static intmax_t pending; /* Pending lines of output. + Always kept 0 if out_quiet is true. */ +static bool done_on_match; /* Stop scanning file on first match. */ +static bool exit_on_match; /* Exit on first match. */ + +#include "dosbuf.c" + +static void +nlscan (char const *lim) +{ + size_t newlines = 0; + char const *beg; + for (beg = lastnl; beg < lim; beg++) + { + beg = memchr (beg, eolbyte, lim - beg); + if (!beg) + break; + newlines++; + } + totalnl = add_count (totalnl, newlines); + lastnl = lim; +} + +/* Print the current filename. */ +static void +print_filename (void) +{ + pr_sgr_start_if (filename_color); + fputs_errno (filename); + pr_sgr_end_if (filename_color); +} + +/* Print a character separator. */ +static void +print_sep (char sep) +{ + pr_sgr_start_if (sep_color); + putchar_errno (sep); + pr_sgr_end_if (sep_color); +} + +/* Print a line number or a byte offset. */ +static void +print_offset (uintmax_t pos, int min_width, const char *color) +{ + /* Do not rely on printf to print pos, since uintmax_t may be longer + than long, and long long is not portable. */ + + char buf[sizeof pos * CHAR_BIT]; + char *p = buf + sizeof buf; + + do + { + *--p = '0' + pos % 10; + --min_width; + } + while ((pos /= 10) != 0); + + /* Do this to maximize the probability of alignment across lines. */ + if (align_tabs) + while (--min_width >= 0) + *--p = ' '; + + pr_sgr_start_if (color); + fwrite_errno (p, 1, buf + sizeof buf - p); + pr_sgr_end_if (color); +} + +/* Print a whole line head (filename, line, byte). 
The output data + starts at BEG and contains LEN bytes; it is followed by at least + sizeof (uword) bytes, the first of which may be temporarily modified. + The output data comes from what is perhaps a larger input line that + goes until LIM, where LIM[-1] is an end-of-line byte. Use SEP as + the separator on output. + + Return true unless the line was suppressed due to an encoding error. */ + +static bool +print_line_head (char *beg, size_t len, char const *lim, char sep) +{ + bool encoding_errors = false; + if (binary_files != TEXT_BINARY_FILES) + { + char ch = beg[len]; + encoding_errors = buf_has_encoding_errors (beg, len); + beg[len] = ch; + } + if (encoding_errors) + { + encoding_error_output = done_on_match = out_quiet = true; + return false; + } + + bool pending_sep = false; + + if (out_file) + { + print_filename (); + if (filename_mask) + pending_sep = true; + else + putchar_errno (0); + } + + if (out_line) + { + if (lastnl < lim) + { + nlscan (beg); + totalnl = add_count (totalnl, 1); + lastnl = lim; + } + if (pending_sep) + print_sep (sep); + print_offset (totalnl, 4, line_num_color); + pending_sep = true; + } + + if (out_byte) + { + uintmax_t pos = add_count (totalcc, beg - bufbeg); + pos = dossified_pos (pos); + if (pending_sep) + print_sep (sep); + print_offset (pos, 6, byte_num_color); + pending_sep = true; + } + + if (pending_sep) + { + /* This assumes sep is one column wide. + Try doing this any other way with Unicode + (and its combining and wide characters) + filenames and you're wasting your efforts. 
*/ + if (align_tabs) + fputs_errno ("\t\b"); + + print_sep (sep); + } + + return true; +} + +static char * +print_line_middle (char *beg, char *lim, + const char *line_color, const char *match_color) +{ + size_t match_size; + size_t match_offset; + char *cur; + char *mid = NULL; + char *b; + + for (cur = beg; + (cur < lim + && ((match_offset = execute (beg, lim - beg, &match_size, cur)) + != (size_t) -1)); + cur = b + match_size) + { + b = beg + match_offset; + + /* Avoid matching the empty line at the end of the buffer. */ + if (b == lim) + break; + + /* Avoid hanging on grep --color "" foo */ + if (match_size == 0) + { + /* Make minimal progress; there may be further non-empty matches. */ + /* XXX - Could really advance by one whole multi-octet character. */ + match_size = 1; + if (!mid) + mid = cur; + } + else + { + /* This function is called on a matching line only, + but is it selected or rejected/context? */ + if (only_matching) + { + char sep = out_invert ? SEP_CHAR_REJECTED : SEP_CHAR_SELECTED; + if (! 
print_line_head (b, match_size, lim, sep)) + return NULL; + } + else + { + pr_sgr_start (line_color); + if (mid) + { + cur = mid; + mid = NULL; + } + fwrite_errno (cur, 1, b - cur); + } + + pr_sgr_start_if (match_color); + fwrite_errno (b, 1, match_size); + pr_sgr_end_if (match_color); + if (only_matching) + putchar_errno (eolbyte); + } + } + + if (only_matching) + cur = lim; + else if (mid) + cur = mid; + + return cur; +} + +static char * +print_line_tail (char *beg, const char *lim, const char *line_color) +{ + size_t eol_size; + size_t tail_size; + + eol_size = (lim > beg && lim[-1] == eolbyte); + eol_size += (lim - eol_size > beg && lim[-(1 + eol_size)] == '\r'); + tail_size = lim - eol_size - beg; + + if (tail_size > 0) + { + pr_sgr_start (line_color); + fwrite_errno (beg, 1, tail_size); + beg += tail_size; + pr_sgr_end (line_color); + } + + return beg; +} + +static void +prline (char *beg, char *lim, char sep) +{ + bool matching; + const char *line_color; + const char *match_color; + + if (!only_matching) + if (! print_line_head (beg, lim - beg - 1, lim, sep)) + return; + + matching = (sep == SEP_CHAR_SELECTED) ^ out_invert; + + if (color_option) + { + line_color = (((sep == SEP_CHAR_SELECTED) + ^ (out_invert && (color_option < 0))) + ? selected_line_color : context_line_color); + match_color = (sep == SEP_CHAR_SELECTED + ? selected_match_color : context_match_color); + } + else + line_color = match_color = NULL; /* Shouldn't be used. */ + + if ((only_matching && matching) + || (color_option && (*line_color || *match_color))) + { + /* We already know that non-matching lines have no match (to colorize). */ + if (matching && (only_matching || *match_color)) + { + beg = print_line_middle (beg, lim, line_color, match_color); + if (! 
beg) + return; + } + + if (!only_matching && *line_color) + { + /* This code is exercised at least when grep is invoked like this: + echo k| GREP_COLORS='sl=01;32' src/grep k --color=always */ + beg = print_line_tail (beg, lim, line_color); + } + } + + if (!only_matching && lim > beg) + fwrite_errno (beg, 1, lim - beg); + + if (line_buffered) + fflush_errno (); + + if (stdout_errno) + error (EXIT_TROUBLE, stdout_errno, _("write error")); + + lastout = lim; +} + +/* Print pending lines of trailing context prior to LIM. Trailing context ends + at the next matching line when OUTLEFT is 0. */ +static void +prpending (char const *lim) +{ + if (!lastout) + lastout = bufbeg; + while (pending > 0 && lastout < lim) + { + char *nl = memchr (lastout, eolbyte, lim - lastout); + size_t match_size; + --pending; + if (outleft + || ((execute (lastout, nl + 1 - lastout, + &match_size, NULL) == (size_t) -1) + == !out_invert)) + prline (lastout, nl + 1, SEP_CHAR_REJECTED); + else + pending = 0; + } +} + +/* Output the lines between BEG and LIM. Deal with context. */ +static void +prtext (char *beg, char *lim) +{ + static bool used; /* Avoid printing SEP_STR_GROUP before any output. */ + char eol = eolbyte; + + if (!out_quiet && pending > 0) + prpending (beg); + + char *p = beg; + + if (!out_quiet) + { + /* Deal with leading context. */ + char const *bp = lastout ? lastout : bufbeg; + intmax_t i; + for (i = 0; i < out_before; ++i) + if (p > bp) + do + --p; + while (p[-1] != eol); + + /* Print the group separator unless the output is adjacent to + the previous output in the file. 
*/ + if ((0 <= out_before || 0 <= out_after) && used + && p != lastout && group_separator) + { + pr_sgr_start_if (sep_color); + fputs_errno (group_separator); + pr_sgr_end_if (sep_color); + putchar_errno ('\n'); + } + + while (p < beg) + { + char *nl = memchr (p, eol, beg - p); + nl++; + prline (p, nl, SEP_CHAR_REJECTED); + p = nl; + } + } + + intmax_t n; + if (out_invert) + { + /* One or more lines are output. */ + for (n = 0; p < lim && n < outleft; n++) + { + char *nl = memchr (p, eol, lim - p); + nl++; + if (!out_quiet) + prline (p, nl, SEP_CHAR_SELECTED); + p = nl; + } + } + else + { + /* Just one line is output. */ + if (!out_quiet) + prline (beg, lim, SEP_CHAR_SELECTED); + n = 1; + p = lim; + } + + after_last_match = bufoffset - (buflim - p); + pending = out_quiet ? 0 : MAX (0, out_after); + used = true; + outleft -= n; +} + +/* Replace all NUL bytes in buffer P (which ends at LIM) with EOL. + This avoids running out of memory when binary input contains a long + sequence of zeros, which would otherwise be considered to be part + of a long line. P[LIM] should be EOL. */ +static void +zap_nuls (char *p, char *lim, char eol) +{ + if (eol) + while (true) + { + *lim = '\0'; + p += strlen (p); + *lim = eol; + if (p == lim) + break; + do + *p++ = eol; + while (!*p); + } +} + +/* Scan the specified portion of the buffer, matching lines (or + between matching lines if OUT_INVERT is true). Return a count of + lines printed. Replace all NUL bytes with NUL_ZAPPER as we go. */ +static intmax_t +grepbuf (char *beg, char const *lim) +{ + intmax_t outleft0 = outleft; + char *endp; + + for (char *p = beg; p < lim; p = endp) + { + size_t match_size; + size_t match_offset = execute (p, lim - p, &match_size, NULL); + if (match_offset == (size_t) -1) + { + if (!out_invert) + break; + match_offset = lim - p; + match_size = 0; + } + char *b = p + match_offset; + endp = b + match_size; + /* Avoid matching the empty line at the end of the buffer. 
*/ + if (!out_invert && b == lim) + break; + if (!out_invert || p < b) + { + char *prbeg = out_invert ? p : b; + char *prend = out_invert ? b : endp; + prtext (prbeg, prend); + if (!outleft || done_on_match) + { + if (exit_on_match) + exit (EXIT_SUCCESS); + break; + } + } + } + + return outleft0 - outleft; +} + +/* Search a given file. Normally, return a count of lines printed; + but if the file is a directory and we search it recursively, then + return -2 if there was a match, and -1 otherwise. */ +static intmax_t +grep (int fd, struct stat const *st) +{ + intmax_t nlines, i; + size_t residue, save; + char oldc; + char *beg; + char *lim; + char eol = eolbyte; + char nul_zapper = '\0'; + bool done_on_match_0 = done_on_match; + bool out_quiet_0 = out_quiet; + + /* The value of NLINES when nulls were first deduced in the input; + this is not necessarily the same as the number of matching lines + before the first null. -1 if no input nulls have been deduced. */ + intmax_t nlines_first_null = -1; + + if (! reset (fd, st)) + return 0; + + totalcc = 0; + lastout = 0; + totalnl = 0; + outleft = max_count; + after_last_match = 0; + pending = 0; + skip_nuls = skip_empty_lines && !eol; + encoding_error_output = false; + seek_data_failed = false; + + nlines = 0; + residue = 0; + save = 0; + + if (! 
fillbuf (save, st)) + { + suppressible_error (filename, errno); + return 0; + } + + for (bool firsttime = true; ; firsttime = false) + { + if (nlines_first_null < 0 && eol && binary_files != TEXT_BINARY_FILES + && (buf_has_nulls (bufbeg, buflim - bufbeg) + || (firsttime && file_must_have_nulls (buflim - bufbeg, fd, st)))) + { + if (binary_files == WITHOUT_MATCH_BINARY_FILES) + return 0; + if (!count_matches) + done_on_match = out_quiet = true; + nlines_first_null = nlines; + nul_zapper = eol; + skip_nuls = skip_empty_lines; + } + + lastnl = bufbeg; + if (lastout) + lastout = bufbeg; + + beg = bufbeg + save; + + /* no more data to scan (eof) except for maybe a residue -> break */ + if (beg == buflim) + break; + + zap_nuls (beg, buflim, nul_zapper); + + /* Determine new residue (the length of an incomplete line at the end of + the buffer, 0 means there is no incomplete last line). */ + oldc = beg[-1]; + beg[-1] = eol; + /* FIXME: use rawmemrchr if/when it exists, since we have ensured + that this use of memrchr is guaranteed never to return NULL. */ + lim = memrchr (beg - 1, eol, buflim - beg + 1); + ++lim; + beg[-1] = oldc; + if (lim == beg) + lim = beg - residue; + beg -= residue; + residue = buflim - lim; + + if (beg < lim) + { + if (outleft) + nlines += grepbuf (beg, lim); + if (pending) + prpending (lim); + if ((!outleft && !pending) + || (done_on_match && MAX (0, nlines_first_null) < nlines)) + goto finish_grep; + } + + /* The last OUT_BEFORE lines at the end of the buffer will be needed as + leading context if there is a matching line at the begin of the + next data. Make beg point to their begin. */ + i = 0; + beg = lim; + while (i < out_before && beg > bufbeg && beg != lastout) + { + ++i; + do + --beg; + while (beg[-1] != eol); + } + + /* Detect whether leading context is adjacent to previous output. */ + if (beg != lastout) + lastout = 0; + + /* Handle some details and read more data to scan. 
*/ + save = residue + lim - beg; + if (out_byte) + totalcc = add_count (totalcc, buflim - bufbeg - save); + if (out_line) + nlscan (beg); + if (! fillbuf (save, st)) + { + suppressible_error (filename, errno); + goto finish_grep; + } + } + if (residue) + { + *buflim++ = eol; + if (outleft) + nlines += grepbuf (bufbeg + save - residue, buflim); + if (pending) + prpending (buflim); + } + + finish_grep: + done_on_match = done_on_match_0; + out_quiet = out_quiet_0; + if (!out_quiet && (encoding_error_output + || (0 <= nlines_first_null && nlines_first_null < nlines))) + { + printf_errno (_("Binary file %s matches\n"), filename); + if (line_buffered) + fflush_errno (); + } + return nlines; +} + +static bool +grepdirent (FTS *fts, FTSENT *ent, bool command_line) +{ + bool follow; + int dirdesc; + command_line &= ent->fts_level == FTS_ROOTLEVEL; + + if (ent->fts_info == FTS_DP) + { + if (directories == RECURSE_DIRECTORIES && command_line) + out_file &= ~ (2 * !no_filenames); + return true; + } + + if (!command_line + && skipped_file (ent->fts_name, false, + (ent->fts_info == FTS_D || ent->fts_info == FTS_DC + || ent->fts_info == FTS_DNR))) + { + fts_set (fts, ent, FTS_SKIP); + return true; + } + + filename = ent->fts_path; + if (omit_dot_slash && filename[1]) + filename += 2; + follow = (fts->fts_options & FTS_LOGICAL + || (fts->fts_options & FTS_COMFOLLOW && command_line)); + + switch (ent->fts_info) + { + case FTS_D: + if (directories == RECURSE_DIRECTORIES) + { + out_file |= 2 * !no_filenames; + return true; + } + fts_set (fts, ent, FTS_SKIP); + break; + + case FTS_DC: + if (!suppress_errors) + error (0, 0, _("warning: %s: %s"), filename, + _("recursive directory loop")); + return true; + + case FTS_DNR: + case FTS_ERR: + case FTS_NS: + suppressible_error (filename, ent->fts_errno); + return true; + + case FTS_DEFAULT: + case FTS_NSOK: + if (skip_devices (command_line)) + { + struct stat *st = ent->fts_statp; + struct stat st1; + if (! 
st->st_mode) + { + /* The file type is not already known. Get the file status + before opening, since opening might have side effects + on a device. */ + int flag = follow ? 0 : AT_SYMLINK_NOFOLLOW; + if (fstatat (fts->fts_cwd_fd, ent->fts_accpath, &st1, flag) != 0) + { + suppressible_error (filename, errno); + return true; + } + st = &st1; + } + if (is_device_mode (st->st_mode)) + return true; + } + break; + + case FTS_F: + case FTS_SLNONE: + break; + + case FTS_SL: + case FTS_W: + return true; + + default: + abort (); + } + + dirdesc = ((fts->fts_options & (FTS_NOCHDIR | FTS_CWDFD)) == FTS_CWDFD + ? fts->fts_cwd_fd + : AT_FDCWD); + return grepfile (dirdesc, ent->fts_accpath, follow, command_line); +} + +/* True if errno is ERR after 'open ("symlink", ... O_NOFOLLOW ...)'. + POSIX specifies ELOOP, but it's EMLINK on FreeBSD and EFTYPE on NetBSD. */ +static bool +open_symlink_nofollow_error (int err) +{ + if (err == ELOOP || err == EMLINK) + return true; +#ifdef EFTYPE + if (err == EFTYPE) + return true; +#endif + return false; +} + +static bool +grepfile (int dirdesc, char const *name, bool follow, bool command_line) +{ + int oflag = (O_RDONLY | O_NOCTTY + | (follow ? 0 : O_NOFOLLOW) + | (skip_devices (command_line) ? O_NONBLOCK : 0)); + int desc = openat_safer (dirdesc, name, oflag); + if (desc < 0) + { + if (follow || ! open_symlink_nofollow_error (errno)) + suppressible_error (filename, errno); + return true; + } + return grepdesc (desc, command_line); +} + +static bool +grepdesc (int desc, bool command_line) +{ + intmax_t count; + bool status = true; + struct stat st; + + /* Get the file status, possibly for the second time. This catches + a race condition if the directory entry changes after the + directory entry is read and before the file is opened. For + example, normally DESC is a directory only at the top level, but + there is an exception if some other process substitutes a + directory for a non-directory while 'grep' is running. 
*/ + if (fstat (desc, &st) != 0) + { + suppressible_error (filename, errno); + goto closeout; + } + + if (desc != STDIN_FILENO && skip_devices (command_line) + && is_device_mode (st.st_mode)) + goto closeout; + + if (desc != STDIN_FILENO && command_line + && skipped_file (filename, true, S_ISDIR (st.st_mode) != 0)) + goto closeout; + + if (desc != STDIN_FILENO + && directories == RECURSE_DIRECTORIES && S_ISDIR (st.st_mode)) + { + /* Traverse the directory starting with its full name, because + unfortunately fts provides no way to traverse the directory + starting from its file descriptor. */ + + FTS *fts; + FTSENT *ent; + int opts = fts_options & ~(command_line ? 0 : FTS_COMFOLLOW); + char *fts_arg[2]; + + /* Close DESC now, to conserve file descriptors if the race + condition occurs many times in a deep recursion. */ + if (close (desc) != 0) + suppressible_error (filename, errno); + + fts_arg[0] = (char *) filename; + fts_arg[1] = NULL; + fts = fts_open (fts_arg, opts, NULL); + + if (!fts) + xalloc_die (); + while ((ent = fts_read (fts))) + status &= grepdirent (fts, ent, command_line); + if (errno) + suppressible_error (filename, errno); + if (fts_close (fts) != 0) + suppressible_error (filename, errno); + return status; + } + if (desc != STDIN_FILENO + && ((directories == SKIP_DIRECTORIES && S_ISDIR (st.st_mode)) + || ((devices == SKIP_DEVICES + || (devices == READ_COMMAND_LINE_DEVICES && !command_line)) + && is_device_mode (st.st_mode)))) + goto closeout; + + /* If there is a regular file on stdout and the current file refers + to the same i-node, we have to report the problem and skip it. + Otherwise when matching lines from some other input reach the + disk before we open this file, we can end up reading and matching + those lines and appending them to the file from which we're reading. + Then we'd have what appears to be an infinite loop that'd terminate + only upon filling the output file system or reaching a quota. 
+ However, there is no risk of an infinite loop if grep is generating + no output, i.e., with --silent, --quiet, -q. + Similarly, with any of these: + --max-count=N (-m) (for N >= 2) + --files-with-matches (-l) + --files-without-match (-L) + there is no risk of trouble. + For --max-count=1, grep stops after printing the first match, + so there is no risk of malfunction. But even --max-count=2, with + input==output, while there is no risk of infloop, there is a race + condition that could result in "alternate" output. */ + if (!out_quiet && list_files == 0 && 1 < max_count + && S_ISREG (out_stat.st_mode) && out_stat.st_ino + && SAME_INODE (st, out_stat)) + { + if (! suppress_errors) + error (0, 0, _("input file %s is also the output"), quote (filename)); + errseen = true; + goto closeout; + } + +#if defined SET_BINARY + /* Set input to binary mode. Pipes are simulated with files + on DOS, so this includes the case of "foo | grep bar". */ + if (!isatty (desc)) + SET_BINARY (desc); +#endif + + count = grep (desc, &st); + if (count < 0) + status = count + 2; + else + { + if (count_matches) + { + if (out_file) + { + print_filename (); + if (filename_mask) + print_sep (SEP_CHAR_SELECTED); + else + putchar_errno (0); + } + printf_errno ("%" PRIdMAX "\n", count); + if (line_buffered) + fflush_errno (); + } + + status = !count; + if (list_files == 1 - 2 * status) + { + print_filename (); + putchar_errno ('\n' & filename_mask); + if (line_buffered) + fflush_errno (); + } + + if (desc == STDIN_FILENO) + { + off_t required_offset = outleft ? bufoffset : after_last_match; + if (required_offset != bufoffset + && lseek (desc, required_offset, SEEK_SET) < 0 + && S_ISREG (st.st_mode)) + suppressible_error (filename, errno); + } + } + + closeout: + if (desc != STDIN_FILENO && close (desc) != 0) + suppressible_error (filename, errno); + return status; +} + +static bool +grep_command_line_arg (char const *arg) +{ + if (STREQ (arg, "-")) + { + filename = label ? 
label : _("(standard input)"); + return grepdesc (STDIN_FILENO, true); + } + else + { + filename = arg; + return grepfile (AT_FDCWD, arg, true, true); + } +} + +_Noreturn void usage (int); +void +usage (int status) +{ + if (status != 0) + { + fprintf (stderr, _("Usage: %s [OPTION]... PATTERN [FILE]...\n"), + program_name); + fprintf (stderr, _("Try '%s --help' for more information.\n"), + program_name); + } + else + { + printf (_("Usage: %s [OPTION]... PATTERN [FILE]...\n"), program_name); + printf (_("Search for PATTERN in each FILE or standard input.\n")); + printf (_("PATTERN is, by default, a basic regular expression (BRE).\n")); + printf (_("\ +Example: %s -i 'hello world' menu.h main.c\n\ +\n\ +Regexp selection and interpretation:\n"), program_name); + printf (_("\ + -E, --extended-regexp PATTERN is an extended regular expression (ERE)\n\ + -F, --fixed-strings PATTERN is a set of newline-separated strings\n\ + -G, --basic-regexp PATTERN is a basic regular expression (BRE)\n\ + -P, --perl-regexp PATTERN is a Perl regular expression\n")); + /* -X is deliberately undocumented. 
*/ + printf (_("\ + -e, --regexp=PATTERN use PATTERN for matching\n\ + -f, --file=FILE obtain PATTERN from FILE\n\ + -i, --ignore-case ignore case distinctions\n\ + -w, --word-regexp force PATTERN to match only whole words\n\ + -x, --line-regexp force PATTERN to match only whole lines\n\ + -z, --null-data a data line ends in 0 byte, not newline\n")); + printf (_("\ +\n\ +Miscellaneous:\n\ + -s, --no-messages suppress error messages\n\ + -v, --invert-match select non-matching lines\n\ + -V, --version display version information and exit\n\ + --help display this help text and exit\n")); + printf (_("\ +\n\ +Output control:\n\ + -m, --max-count=NUM stop after NUM matches\n\ + -b, --byte-offset print the byte offset with output lines\n\ + -n, --line-number print line number with output lines\n\ + --line-buffered flush output on every line\n\ + -H, --with-filename print the file name for each match\n\ + -h, --no-filename suppress the file name prefix on output\n\ + --label=LABEL use LABEL as the standard input file name prefix\n\ +")); + printf (_("\ + -o, --only-matching show only the part of a line matching PATTERN\n\ + -q, --quiet, --silent suppress all normal output\n\ + --binary-files=TYPE assume that binary files are TYPE;\n\ + TYPE is 'binary', 'text', or 'without-match'\n\ + -a, --text equivalent to --binary-files=text\n\ +")); + printf (_("\ + -I equivalent to --binary-files=without-match\n\ + -d, --directories=ACTION how to handle directories;\n\ + ACTION is 'read', 'recurse', or 'skip'\n\ + -D, --devices=ACTION how to handle devices, FIFOs and sockets;\n\ + ACTION is 'read' or 'skip'\n\ + -r, --recursive like --directories=recurse\n\ + -R, --dereference-recursive likewise, but follow all symlinks\n\ +")); + printf (_("\ + --include=FILE_PATTERN search only files that match FILE_PATTERN\n\ + --exclude=FILE_PATTERN skip files and directories matching\ + FILE_PATTERN\n\ + --exclude-from=FILE skip files matching any file pattern from FILE\n\ + 
--exclude-dir=PATTERN directories that match PATTERN will be skipped.\n\ +")); + printf (_("\ + -L, --files-without-match print only names of FILEs containing no match\n\ + -l, --files-with-matches print only names of FILEs containing matches\n\ + -c, --count print only a count of matching lines per FILE\n\ + -T, --initial-tab make tabs line up (if needed)\n\ + -Z, --null print 0 byte after FILE name\n")); + printf (_("\ +\n\ +Context control:\n\ + -B, --before-context=NUM print NUM lines of leading context\n\ + -A, --after-context=NUM print NUM lines of trailing context\n\ + -C, --context=NUM print NUM lines of output context\n\ +")); + printf (_("\ + -NUM same as --context=NUM\n\ + --color[=WHEN],\n\ + --colour[=WHEN] use markers to highlight the matching strings;\n\ + WHEN is 'always', 'never', or 'auto'\n\ + -U, --binary do not strip CR characters at EOL (MSDOS/Windows)\n\ + -u, --unix-byte-offsets report offsets as if CRs were not there\n\ + (MSDOS/Windows)\n\ +\n")); + printf (_("\ +'egrep' means 'grep -E'. 'fgrep' means 'grep -F'.\n\ +Direct invocation as either 'egrep' or 'fgrep' is deprecated.\n")); + printf (_("\ +When FILE is -, read standard input. With no FILE, read . if a command-line\n\ +-r is given, - otherwise. If fewer than two FILEs are given, assume -h.\n\ +Exit status is 0 if any line is selected, 1 otherwise;\n\ +if any error occurs and -q is not given, the exit status is 2.\n")); + emit_bug_reporting_address (); + } + exit (status); +} + +/* Pattern compilers and matchers. 
*/ + +static void +Gcompile (char const *pattern, size_t size) +{ + GEAcompile (pattern, size, RE_SYNTAX_GREP); +} + +static void +Ecompile (char const *pattern, size_t size) +{ + GEAcompile (pattern, size, RE_SYNTAX_EGREP); +} + +static void +Acompile (char const *pattern, size_t size) +{ + GEAcompile (pattern, size, RE_SYNTAX_AWK); +} + +static void +GAcompile (char const *pattern, size_t size) +{ + GEAcompile (pattern, size, RE_SYNTAX_GNU_AWK); +} + +static void +PAcompile (char const *pattern, size_t size) +{ + GEAcompile (pattern, size, RE_SYNTAX_POSIX_AWK); +} + +struct matcher +{ + char const name[16]; + compile_fp_t compile; + execute_fp_t execute; +}; +static struct matcher const matchers[] = { + { "grep", Gcompile, EGexecute }, + { "egrep", Ecompile, EGexecute }, + { "fgrep", Fcompile, Fexecute }, + { "awk", Acompile, EGexecute }, + { "gawk", GAcompile, EGexecute }, + { "posixawk", PAcompile, EGexecute }, + { "perl", Pcompile, Pexecute }, + { "", NULL, NULL }, +}; + +/* Set the matcher to M if available. Exit in case of conflicts or if + M is not available. */ +static void +setmatcher (char const *m) +{ + struct matcher const *p; + + if (matcher && !STREQ (matcher, m)) + error (EXIT_TROUBLE, 0, _("conflicting matchers specified")); + + for (p = matchers; p->compile; p++) + if (STREQ (m, p->name)) + { + matcher = p->name; + compile = p->compile; + execute = p->execute; + return; + } + + error (EXIT_TROUBLE, 0, _("invalid matcher %s"), m); +} + +/* Find the white-space-separated options specified by OPTIONS, and + using BUF to store copies of these options, set ARGV[0], ARGV[1], + etc. to the option copies. Return the number N of options found. + Do not set ARGV[N] to NULL. If ARGV is NULL, do not store ARGV[0] + etc. Backslash can be used to escape whitespace (and backslashes). 
*/ +static size_t +prepend_args (char const *options, char *buf, char **argv) +{ + char const *o = options; + char *b = buf; + size_t n = 0; + + for (;;) + { + while (c_isspace (to_uchar (*o))) + o++; + if (!*o) + return n; + if (argv) + argv[n] = b; + n++; + + do + if ((*b++ = *o++) == '\\' && *o) + b[-1] = *o++; + while (*o && ! c_isspace (to_uchar (*o))); + + *b++ = '\0'; + } +} + +/* Prepend the whitespace-separated options in OPTIONS to the argument + vector of a main program with argument count *PARGC and argument + vector *PARGV. Return the number of options prepended. */ +static int +prepend_default_options (char const *options, int *pargc, char ***pargv) +{ + if (options && *options) + { + char *buf = xmalloc (strlen (options) + 1); + size_t prepended = prepend_args (options, buf, NULL); + int argc = *pargc; + char *const *argv = *pargv; + char **pp; + enum { MAX_ARGS = MIN (INT_MAX, SIZE_MAX / sizeof *pp - 1) }; + if (MAX_ARGS - argc < prepended) + xalloc_die (); + pp = xmalloc ((prepended + argc + 1) * sizeof *pp); + *pargc = prepended + argc; + *pargv = pp; + *pp++ = *argv++; + pp += prepend_args (options, buf, pp); + while ((*pp++ = *argv++)) + continue; + return prepended; + } + + return 0; +} + +/* Get the next non-digit option from ARGC and ARGV. + Return -1 if there are no more options. + Process any digit options that were encountered on the way, + and store the resulting integer into *DEFAULT_CONTEXT. */ +static int +get_nondigit_option (int argc, char *const *argv, intmax_t *default_context) +{ + static int prev_digit_optind = -1; + int this_digit_optind; + bool was_digit; + char buf[INT_BUFSIZE_BOUND (intmax_t) + 4]; + char *p = buf; + int opt; + + was_digit = false; + this_digit_optind = optind; + while (true) + { + opt = getopt_long (argc, (char **) argv, short_options, + long_options, NULL); + if ( ! 
('0' <= opt && opt <= '9')) + break; + + if (prev_digit_optind != this_digit_optind || !was_digit) + { + /* Reset to start another context length argument. */ + p = buf; + } + else + { + /* Suppress trivial leading zeros, to avoid incorrect + diagnostic on strings like 00000000000. */ + p -= buf[0] == '0'; + } + + if (p == buf + sizeof buf - 4) + { + /* Too many digits. Append "..." to make context_length_arg + complain about "X...", where X contains the digits seen + so far. */ + strcpy (p, "..."); + p += 3; + break; + } + *p++ = opt; + + was_digit = true; + prev_digit_optind = this_digit_optind; + this_digit_optind = optind; + } + if (p != buf) + { + *p = '\0'; + context_length_arg (buf, default_context); + } + + return opt; +} + +/* Parse GREP_COLORS. The default would look like: + GREP_COLORS='ms=01;31:mc=01;31:sl=:cx=:fn=35:ln=32:bn=32:se=36' + with boolean capabilities (ne and rv) unset (i.e., omitted). + No character escaping is needed or supported. */ +static void +parse_grep_colors (void) +{ + const char *p; + char *q; + char *name; + char *val; + + p = getenv ("GREP_COLORS"); /* Plural! */ + if (p == NULL || *p == '\0') + return; + + /* Work off a writable copy. */ + q = xstrdup (p); + + name = q; + val = NULL; + /* From now on, be well-formed or you're gone. */ + for (;;) + if (*q == ':' || *q == '\0') + { + char c = *q; + struct color_cap const *cap; + + *q++ = '\0'; /* Terminate name or val. */ + /* Empty name without val (empty cap) + * won't match and will be ignored. */ + for (cap = color_dict; cap->name; cap++) + if (STREQ (cap->name, name)) + break; + /* If name unknown, go on for forward compatibility. */ + if (cap->var && val) + *(cap->var) = val; + if (cap->fct) + cap->fct (); + if (c == '\0') + return; + name = q; + val = NULL; + } + else if (*q == '=') + { + if (q == name || val) + return; + *q++ = '\0'; /* Terminate name. */ + val = q; /* Can be the empty string. */ + } + else if (val == NULL) + q++; /* Accumulate name. 
*/ + else if (*q == ';' || (*q >= '0' && *q <= '9')) + q++; /* Accumulate val. Protect the terminal from being sent crap. */ + else + return; +} + +/* Return true if PAT (of length PATLEN) contains an encoding error. */ +static bool +contains_encoding_error (char const *pat, size_t patlen) +{ + mbstate_t mbs = { 0 }; + size_t i, charlen; + + for (i = 0; i < patlen; i += charlen) + { + charlen = mb_clen (pat + i, patlen - i, &mbs); + if ((size_t) -2 <= charlen) + return true; + } + return false; +} + +/* Change a pattern for fgrep into grep. */ +static void +fgrep_to_grep_pattern (size_t len, char const *keys, + size_t *new_len, char **new_keys) +{ + char *p = *new_keys = xnmalloc (len + 1, 2); + mbstate_t mb_state = { 0 }; + size_t n; + + for (; len; keys += n, len -= n) + { + n = mb_clen (keys, len, &mb_state); + switch (n) + { + case (size_t) -2: + n = len; + /* Fall through. */ + default: + p = mempcpy (p, keys, n); + break; + + case (size_t) -1: + memset (&mb_state, 0, sizeof mb_state); + /* Fall through. */ + case 1: + *p = '\\'; + p += strchr ("$*.[\\^", *keys) != NULL; + /* Fall through. */ + case 0: + *p++ = *keys; + n = 1; + break; + } + } + + *new_len = p - *new_keys; +} + +int +main (int argc, char **argv) +{ + char *keys; + size_t keycc, oldcc, keyalloc; + bool with_filenames; + size_t cc; + int opt, prepended; + int prev_optind, last_recursive; + int fread_errno; + intmax_t default_context; + FILE *fp; + exit_failure = EXIT_TROUBLE; + initialize_main (&argc, &argv); + set_program_name (argv[0]); + program_name = argv[0]; + + keys = NULL; + keycc = 0; + with_filenames = false; + eolbyte = '\n'; + filename_mask = ~0; + + max_count = INTMAX_MAX; + + /* The value -1 means to use DEFAULT_CONTEXT. */ + out_after = out_before = -1; + /* Default before/after context: changed by -C/-NUM options */ + default_context = -1; + /* Changed by -o option */ + only_matching = false; + + /* Internationalization. 
*/ +#if defined HAVE_SETLOCALE + setlocale (LC_ALL, ""); +#endif +#if defined ENABLE_NLS + bindtextdomain (PACKAGE, LOCALEDIR); + textdomain (PACKAGE); +#endif + + exit_failure = EXIT_TROUBLE; + atexit (clean_up_stdout); + + last_recursive = 0; + + prepended = prepend_default_options (getenv ("GREP_OPTIONS"), &argc, &argv); + if (prepended) + error (0, 0, _("warning: GREP_OPTIONS is deprecated;" + " please use an alias or script")); + + compile = matchers[0].compile; + execute = matchers[0].execute; + + while (prev_optind = optind, + (opt = get_nondigit_option (argc, argv, &default_context)) != -1) + switch (opt) + { + case 'A': + context_length_arg (optarg, &out_after); + break; + + case 'B': + context_length_arg (optarg, &out_before); + break; + + case 'C': + /* Set output match context, but let any explicit leading or + trailing amount specified with -A or -B stand. */ + context_length_arg (optarg, &default_context); + break; + + case 'D': + if (STREQ (optarg, "read")) + devices = READ_DEVICES; + else if (STREQ (optarg, "skip")) + devices = SKIP_DEVICES; + else + error (EXIT_TROUBLE, 0, _("unknown devices method")); + break; + + case 'E': + setmatcher ("egrep"); + break; + + case 'F': + setmatcher ("fgrep"); + break; + + case 'P': + setmatcher ("perl"); + break; + + case 'G': + setmatcher ("grep"); + break; + + case 'X': /* undocumented on purpose */ + setmatcher (optarg); + break; + + case 'H': + with_filenames = true; + no_filenames = false; + break; + + case 'I': + binary_files = WITHOUT_MATCH_BINARY_FILES; + break; + + case 'T': + align_tabs = true; + break; + + case 'U': + dos_binary (); + break; + + case 'u': + dos_unix_byte_offsets (); + break; + + case 'V': + show_version = true; + break; + + case 'a': + binary_files = TEXT_BINARY_FILES; + break; + + case 'b': + out_byte = true; + break; + + case 'c': + count_matches = true; + break; + + case 'd': + directories = XARGMATCH ("--directories", optarg, + directories_args, directories_types); + if 
(directories == RECURSE_DIRECTORIES) + last_recursive = prev_optind; + break; + + case 'e': + cc = strlen (optarg); + keys = xrealloc (keys, keycc + cc + 1); + strcpy (&keys[keycc], optarg); + keycc += cc; + keys[keycc++] = '\n'; + break; + + case 'f': + fp = STREQ (optarg, "-") ? stdin : fopen (optarg, O_TEXT ? "rt" : "r"); + if (!fp) + error (EXIT_TROUBLE, errno, "%s", optarg); + for (keyalloc = 1; keyalloc <= keycc + 1; keyalloc *= 2) + ; + keys = xrealloc (keys, keyalloc); + oldcc = keycc; + while ((cc = fread (keys + keycc, 1, keyalloc - 1 - keycc, fp)) != 0) + { + keycc += cc; + if (keycc == keyalloc - 1) + keys = x2nrealloc (keys, &keyalloc, sizeof *keys); + } + fread_errno = errno; + if (ferror (fp)) + error (EXIT_TROUBLE, fread_errno, "%s", optarg); + if (fp != stdin) + fclose (fp); + /* Append final newline if file ended in non-newline. */ + if (oldcc != keycc && keys[keycc - 1] != '\n') + keys[keycc++] = '\n'; + break; + + case 'h': + with_filenames = false; + no_filenames = true; + break; + + case 'i': + case 'y': /* For old-timers . . . */ + match_icase = true; + break; + + case 'L': + /* Like -l, except list files that don't contain matches. + Inspired by the same option in Hume's gre. */ + list_files = -1; + break; + + case 'l': + list_files = 1; + break; + + case 'm': + switch (xstrtoimax (optarg, 0, 10, &max_count, "")) + { + case LONGINT_OK: + case LONGINT_OVERFLOW: + break; + + default: + error (EXIT_TROUBLE, 0, _("invalid max count")); + } + break; + + case 'n': + out_line = true; + break; + + case 'o': + only_matching = true; + break; + + case 'q': + exit_on_match = true; + exit_failure = 0; + break; + + case 'R': + fts_options = basic_fts_options | FTS_LOGICAL; + /* Fall through. 
*/ + case 'r': + directories = RECURSE_DIRECTORIES; + last_recursive = prev_optind; + break; + + case 's': + suppress_errors = true; + break; + + case 'v': + out_invert = true; + break; + + case 'w': + match_words = true; + break; + + case 'x': + match_lines = true; + break; + + case 'Z': + filename_mask = 0; + break; + + case 'z': + eolbyte = '\0'; + break; + + case BINARY_FILES_OPTION: + if (STREQ (optarg, "binary")) + binary_files = BINARY_BINARY_FILES; + else if (STREQ (optarg, "text")) + binary_files = TEXT_BINARY_FILES; + else if (STREQ (optarg, "without-match")) + binary_files = WITHOUT_MATCH_BINARY_FILES; + else + error (EXIT_TROUBLE, 0, _("unknown binary-files type")); + break; + + case COLOR_OPTION: + if (optarg) + { + if (!strcasecmp (optarg, "always") || !strcasecmp (optarg, "yes") + || !strcasecmp (optarg, "force")) + color_option = 1; + else if (!strcasecmp (optarg, "never") || !strcasecmp (optarg, "no") + || !strcasecmp (optarg, "none")) + color_option = 0; + else if (!strcasecmp (optarg, "auto") || !strcasecmp (optarg, "tty") + || !strcasecmp (optarg, "if-tty")) + color_option = 2; + else + show_help = 1; + } + else + color_option = 2; + break; + + case EXCLUDE_OPTION: + case INCLUDE_OPTION: + for (int cmd = 0; cmd < 2; cmd++) + { + if (!excluded_patterns[cmd]) + excluded_patterns[cmd] = new_exclude (); + add_exclude (excluded_patterns[cmd], optarg, + ((opt == INCLUDE_OPTION ? 
EXCLUDE_INCLUDE : 0) + | exclude_options (cmd))); + } + break; + case EXCLUDE_FROM_OPTION: + for (int cmd = 0; cmd < 2; cmd++) + { + if (!excluded_patterns[cmd]) + excluded_patterns[cmd] = new_exclude (); + if (add_exclude_file (add_exclude, excluded_patterns[cmd], + optarg, exclude_options (cmd), '\n') + != 0) + error (EXIT_TROUBLE, errno, "%s", optarg); + } + break; + + case EXCLUDE_DIRECTORY_OPTION: + strip_trailing_slashes (optarg); + for (int cmd = 0; cmd < 2; cmd++) + { + if (!excluded_directory_patterns[cmd]) + excluded_directory_patterns[cmd] = new_exclude (); + add_exclude (excluded_directory_patterns[cmd], optarg, + exclude_options (cmd)); + } + break; + + case GROUP_SEPARATOR_OPTION: + group_separator = optarg; + break; + + case LINE_BUFFERED_OPTION: + line_buffered = true; + break; + + case LABEL_OPTION: + label = optarg; + break; + + case 0: + /* long options */ + break; + + default: + usage (EXIT_TROUBLE); + break; + + } + + if (color_option == 2) + color_option = isatty (STDOUT_FILENO) && should_colorize (); + init_colorize (); + + /* POSIX says that -q overrides -l, which in turn overrides the + other output options. */ + if (exit_on_match) + list_files = 0; + if (exit_on_match | list_files) + { + count_matches = false; + done_on_match = true; + } + out_quiet = count_matches | done_on_match; + + if (out_after < 0) + out_after = default_context; + if (out_before < 0) + out_before = default_context; + + if (color_option) + { + /* Legacy. */ + char *userval = getenv ("GREP_COLOR"); + if (userval != NULL && *userval != '\0') + selected_match_color = context_match_color = userval; + + /* New GREP_COLORS has priority. 
*/ + parse_grep_colors (); + } + + if (show_version) + { + version_etc (stdout, program_name, PACKAGE_NAME, VERSION, AUTHORS, + (char *) NULL); + return EXIT_SUCCESS; + } + + if (show_help) + usage (EXIT_SUCCESS); + + struct stat tmp_stat; + if (fstat (STDOUT_FILENO, &tmp_stat) == 0 && S_ISREG (tmp_stat.st_mode)) + out_stat = tmp_stat; + + if (keys) + { + if (keycc == 0) + { + /* No keys were specified (e.g. -f /dev/null). Match nothing. */ + out_invert ^= true; + match_lines = match_words = false; + } + else + /* Strip trailing newline. */ + --keycc; + } + else if (optind < argc) + { + /* A copy must be made in case of an xrealloc() or free() later. */ + keycc = strlen (argv[optind]); + keys = xmemdup (argv[optind++], keycc + 1); + } + else + usage (EXIT_TROUBLE); + + build_mbclen_cache (); + initialize_unibyte_mask (); + + /* In a unibyte locale, switch from fgrep to grep if + the pattern matches words (where grep is typically faster). + In a multibyte locale, switch from fgrep to grep if either + (1) case is ignored (where grep is typically faster), or + (2) the pattern has an encoding error (where fgrep might not work). */ + if (compile == Fcompile + && (MB_CUR_MAX <= 1 + ? match_words + : match_icase || contains_encoding_error (keys, keycc))) + { + size_t new_keycc; + char *new_keys; + fgrep_to_grep_pattern (keycc, keys, &new_keycc, &new_keys); + free (keys); + keys = new_keys; + keycc = new_keycc; + matcher = "grep"; + compile = Gcompile; + execute = EGexecute; + } + + compile (keys, keycc); + free (keys); + /* We need one byte prior and one after. */ + char eolbytes[3] = { 0, eolbyte, 0 }; + size_t match_size; + skip_empty_lines = ((execute (eolbytes + 1, 1, &match_size, NULL) == 0) + == out_invert); + + if ((argc - optind > 1 && !no_filenames) || with_filenames) + out_file = 1; + +#ifdef SET_BINARY + /* Output is set to binary mode because we shouldn't convert + NL to CR-LF pairs, especially when grepping binary files. 
*/ + if (!isatty (STDOUT_FILENO)) + SET_BINARY (STDOUT_FILENO); +#endif + + if (max_count == 0) + return EXIT_FAILURE; + + if (fts_options & FTS_LOGICAL && devices == READ_COMMAND_LINE_DEVICES) + devices = READ_DEVICES; + + char *const *files; + if (optind < argc) + { + files = argv + optind; + } + else if (directories == RECURSE_DIRECTORIES && prepended < last_recursive) + { + static char *const cwd_only[] = { (char *) ".", NULL }; + files = cwd_only; + omit_dot_slash = true; + } + else + { + static char *const stdin_only[] = { (char *) "-", NULL }; + files = stdin_only; + } + + bool status = true; + do + status &= grep_command_line_arg (*files++); + while (*files != NULL); + + /* We register via atexit() to test stdout. */ + return errseen ? EXIT_TROUBLE : status; +} diff --git a/src/grep.h b/src/grep.h new file mode 100644 index 0000000..75b7ef7 --- /dev/null +++ b/src/grep.h @@ -0,0 +1,34 @@ +/* grep.h - interface to grep driver for searching subroutines. + Copyright (C) 1992, 1998, 2001, 2007, 2009-2016 Free Software Foundation, + Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA + 02110-1301, USA. */ + +#ifndef GREP_GREP_H +#define GREP_GREP_H 1 + +#include + +/* The following flags are exported from grep for the matchers + to look at. 
*/
+extern bool match_icase;  /* -i (and the old -y): case is ignored */
+extern bool match_words;  /* -w: a match must be a whole word */
+extern bool match_lines;  /* -x: a match must be a whole line */
+extern char eolbyte;      /* end-of-line byte; -z sets it to '\0' */
+
+extern bool buf_has_encoding_errors (char *, size_t);
+
+#endif
diff --git a/src/kwsearch.c b/src/kwsearch.c
new file mode 100644
index 0000000..e9966d4
--- /dev/null
+++ b/src/kwsearch.c
@@ -0,0 +1,165 @@
+/* kwsearch.c - searching subroutines using kwset for grep.
+   Copyright 1992, 1998, 2000, 2007, 2009-2016 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+   02110-1301, USA.  */
+
+/* Written August 1992 by Mike Haertel. */
+
+#include
+#include "search.h"
+
+/* Whether -w considers WC to be a word constituent, namely an
+   underscore or any wide character for which iswalnum is true.  */
+static bool
+wordchar (wint_t wc)
+{
+  return wc == L'_' || iswalnum (wc);
+}
+
+/* KWset compiled pattern.  For Ecompile and Gcompile, we compile
+   a list of strings, at least one of which is known to occur in
+   any string matching the regexp.
*/
+static kwset_t kwset;
+/* Compile the SIZE bytes at PATTERN into the file-scope KWSET.
+   PATTERN is split at each newline and every piece (even an empty
+   one) is added as one keyword with kwsincr.  When match_lines is
+   set, each keyword is first wrapped in a pair of eolbyte bytes, so
+   a match has to be delimited by end-of-line bytes.  kwsprep is
+   called once all keywords have been added.  */
+void
+Fcompile (char const *pattern, size_t size)
+{
+  size_t total = size;
+
+  kwsinit (&kwset);
+
+  char const *p = pattern;
+  do
+    {
+      size_t len;
+      char const *sep = memchr (p, '\n', total);
+      if (sep)
+        {
+          len = sep - p;
+          sep++;                /* first byte of the next keyword */
+          total -= (len + 1);
+        }
+      else
+        {
+          len = total;          /* last keyword: it runs to the end */
+          total = 0;
+        }
+
+      char *buf = NULL;
+      if (match_lines)
+        {
+          buf = xmalloc (len + 2);
+          buf[0] = eolbyte;
+          memcpy (buf + 1, p, len);
+          buf[len + 1] = eolbyte;
+          p = buf;
+          len += 2;
+        }
+      kwsincr (kwset, p, len);
+      free (buf);               /* still null unless match_lines is set */
+
+      p = sep;                  /* null after the last keyword */
+    }
+  while (p);
+
+  kwsprep (kwset);
+}
+/* Search the SIZE bytes at BUF for a keyword compiled by Fcompile.
+   Return the offset in BUF of the result and store its length in
+   *MATCH_SIZE, or return (size_t) -1 if there is no match.  If
+   START_PTR is null, the search starts at BUF and the result is the
+   whole line holding the match, up to and including its eolbyte if
+   it has one.  Otherwise the search starts at START_PTR and the
+   result is the match itself.  */
+size_t
+Fexecute (char *buf, size_t size, size_t *match_size,
+          char const *start_ptr)
+{
+  char const *beg, *try, *end, *mb_start;
+  size_t len;
+  char eol = eolbyte;
+  struct kwsmatch kwsmatch;
+  size_t ret_val;
+
+  for (mb_start = beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++)
+    {
+      size_t offset = kwsexec (kwset, beg - match_lines,
+                               buf + size - beg + match_lines, &kwsmatch);
+      if (offset == (size_t) -1)
+        goto failure;
+      len = kwsmatch.size[0] - 2 * match_lines; /* without the two eolbytes
+                                                   Fcompile adds for -x */
+      if (!match_lines && MB_CUR_MAX > 1 && !using_utf8 ()
+          && mb_goback (&mb_start, beg + offset, buf + size) != 0)
+        {
+          /* We have matched a single byte that is not at the beginning of a
+             multibyte character.  mb_goback has advanced MB_START past that
+             multibyte character.  Now, we want to position BEG so that the
+             next kwsexec search starts there.  Thus, to compensate for the
+             for-loop's BEG++, above, subtract one here.  This code is
+             unusually hard to reach, and exceptionally, let's show how to
+             trigger it here:
+
+               printf '\203AA\n'|LC_ALL=ja_JP.SHIFT_JIS src/grep -F A
+
+             That assumes the named locale is installed.
+             Note that your system's shift-JIS locale may have a different
+             name, possibly including "sjis".  */
+          beg = mb_start - 1;
+          continue;
+        }
+      beg += offset;
+      if (start_ptr && !match_words)
+        goto success_in_beg_and_len;
+      if (match_lines)
+        {
+          len += start_ptr == NULL;     /* count the closing eolbyte when
+                                           the whole line is wanted */
+          goto success_in_beg_and_len;
+        }
+      if (match_words)
+        for (try = beg; ; )
+          {
+            char const *bol = memrchr (buf, eol, beg - buf);
+            bol = bol ? bol + 1 : buf;
+            if (wordchar (mb_prev_wc (bol, try, buf + size)))
+              break;
+            if (wordchar (mb_next_wc (try + len, buf + size)))
+              {
+                if (!len)
+                  break;
+                offset = kwsexec (kwset, beg, --len, &kwsmatch);
+                if (offset == (size_t) -1)
+                  break;
+                try = beg + offset;
+                len = kwsmatch.size[0];
+              }
+            else if (!start_ptr)
+              goto success;
+            else
+              goto success_in_beg_and_len;
+          } /* for (try) */
+      else
+        goto success;
+    } /* for (beg in buf) */
+
+ failure:
+  return -1;
+
+ success:
+  end = memchr (beg + len, eol, (buf + size) - (beg + len));
+  end = end ? end + 1 : buf + size;
+  beg = memrchr (buf, eol, beg - buf);
+  beg = beg ? beg + 1 : buf;
+  len = end - beg;
+ success_in_beg_and_len:;
+  size_t off = beg - buf;
+
+  *match_size = len;
+  ret_val = off;
+  return ret_val;
+}
diff --git a/src/kwset.c b/src/kwset.c
new file mode 100644
index 0000000..03520b6
--- /dev/null
+++ b/src/kwset.c
@@ -0,0 +1,868 @@
+/* kwset.c - search for any of a set of keywords.
+   Copyright (C) 1989, 1998, 2000, 2005, 2007, 2009-2016 Free Software
+   Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+   02110-1301, USA.  */
+
+/* Written August 1989 by Mike Haertel.
+   The author may be reached (Email) at the address mike@ai.mit.edu,
+   or (US mail) as Mike Haertel c/o Free Software Foundation. */
+
+/* The algorithm implemented by these routines bears a startling resemblance
+   to one discovered by Beate Commentz-Walter, although it is not identical.
+   See: Commentz-Walter B. A string matching algorithm fast on the average.
+   Lecture Notes in Computer Science 71 (1979), 118-32.
+   See also: Aho AV, Corasick MJ. Efficient string matching: an aid to
+   bibliographic search. CACM 18, 6 (1975), 333-40, which describes the
+   failure function used below. */
+
+#include
+
+#include "kwset.h"
+
+#include
+#include
+#include
+#include "system.h"
+#include "memchr2.h"
+#include "obstack.h"
+#include "xalloc.h"
+
+#define link kwset_link
+
+#ifdef GREP
+# include "xalloc.h"
+# undef malloc
+# define malloc xmalloc
+#endif
+
+#define NCHAR (UCHAR_MAX + 1)   /* Number of distinct unsigned char values.  */
+#define obstack_chunk_alloc malloc
+#define obstack_chunk_free free
+
+#define U(c) (to_uchar (c))     /* C via to_uchar, e.g. for use as an index.  */
+
+/* Balanced tree of edges and labels leaving a given trie node.
+   The tree is ordered by LABEL: smaller labels are reached through
+   LLINK and larger ones through RLINK.  */
+struct tree
+{
+  struct tree *llink;           /* Left link; MUST be first field. */
+  struct tree *rlink;           /* Right link (to larger labels). */
+  struct trie *trie;            /* Trie node pointed to by this edge. */
+  unsigned char label;          /* Label on this edge. */
+  char balance;                 /* Difference in depths of subtrees.
+                                   kwsincr decrements it for an insertion
+                                   on the left and increments it for one
+                                   on the right.  */
+};
+
+/* Node of a trie representing a set of reversed keywords. */
+struct trie
+{
+  size_t accepting;             /* Zero if no word ends at this node;
+                                   otherwise 1 + 2 * (index of the
+                                   accepted word), as set by kwsincr.  */
+  struct tree *links;           /* Tree of edges leaving this node. */
+  struct trie *parent;          /* Parent of this node. */
+  struct trie *next;            /* List of all trie nodes in level order.  */
+  struct trie *fail;            /* Aho-Corasick failure function. */
+  int depth;                    /* Depth of this node from the root. */
+  int shift;                    /* Shift function for search failures. */
+  int maxshift;                 /* Max shift of self and descendants. */
+};
+
+/* Structure returned opaquely to the caller, containing everything. */
+struct kwset
+{
+  struct obstack obstack;       /* Obstack for node allocation. */
+  ptrdiff_t words;              /* Number of words added with kwsincr;
+                                   a repeated word is counted again.  */
+  struct trie *trie;            /* The trie itself. */
+  int mind;                     /* Minimum depth of an accepting node. */
+  int maxd;                     /* Maximum depth of any node. */
+  unsigned char delta[NCHAR];   /* Delta table for rapid search. */
+  struct trie *next[NCHAR];     /* Table of children of the root. */
+  char *target;                 /* Target string if there's only one. */
+  int *shift;                   /* Used in Boyer-Moore search for one string. */
+  char const *trans;            /* Character translation table. */
+
+  /* If there's only one string, this is the string's last byte,
+     translated via TRANS if TRANS is nonnull.  */
+  char gc1;
+
+  /* Likewise for the string's penultimate byte, if it has two or more
+     bytes.  */
+  char gc2;
+
+  /* If there's only one string, this helps to match the string's last byte.
+     If GC1HELP is negative, only GC1 matches the string's last byte;
+     otherwise at least two bytes match, and B matches if TRANS[B] == GC1.
+     If GC1HELP is in the range 0..(NCHAR - 1), there are exactly two
+     such matches, and GC1HELP is the other match after conversion to
+     unsigned char.  If GC1HELP is at least NCHAR, there are three or
+     more such matches; e.g., Greek has three sigma characters that
+     all match when case-folding.  */
+  int gc1help;
+};
+
+/* Use TRANS to transliterate C.  A null TRANS does no transliteration.
+   A nonnull TRANS is indexed by C converted with U.  */
+static inline char
+tr (char const *trans, char c)
+{
+  return trans ? trans[U(c)] : c;
+}
+
+/* Allocate and initialize a keyword set object, returning an opaque
+   pointer to it.
*/
+kwset_t
+kwsalloc (char const *trans)
+{
+  struct kwset *kwset = xmalloc (sizeof *kwset);
+
+  obstack_init (&kwset->obstack);
+  kwset->words = 0;
+  kwset->trie = obstack_alloc (&kwset->obstack, sizeof *kwset->trie); /* root */
+  kwset->trie->accepting = 0;
+  kwset->trie->links = NULL;
+  kwset->trie->parent = NULL;
+  kwset->trie->next = NULL;
+  kwset->trie->fail = NULL;
+  kwset->trie->depth = 0;
+  kwset->trie->shift = 0;
+  kwset->mind = INT_MAX;        /* lowered by kwsincr as words are added */
+  kwset->maxd = -1;             /* raised by kwsincr as words are added */
+  kwset->target = NULL;
+  kwset->trans = trans;
+
+  return kwset;
+}
+
+/* This upper bound is valid for CHAR_BIT >= 4 and
+   exact for CHAR_BIT in { 4..11, 13, 15, 17, 19 }.
+   kwsincr uses it as the length of the arrays that record the path
+   followed through one trie node's tree of links.  */
+#define DEPTH_SIZE (CHAR_BIT + CHAR_BIT/2)
+
+/* Add the given string to the contents of the keyword set.
+   The string is the LEN bytes at TEXT.  Its bytes are taken from last
+   to first and each is translated through KWSET->trans when that
+   table is nonnull.  KWSET->words, KWSET->mind and KWSET->maxd are
+   updated to account for the new string.  */
+void
+kwsincr (kwset_t kwset, char const *text, size_t len)
+{
+  struct trie *trie = kwset->trie;
+  char const *trans = kwset->trans;
+
+  text += len;
+
+  /* Descend the trie (built of reversed keywords) character-by-character,
+     installing new nodes when necessary.  */
+  while (len--)
+    {
+      unsigned char uc = *--text;
+      unsigned char label = trans ? trans[uc] : uc;
+
+      /* Descend the tree of outgoing links for this trie node,
+         looking for the current character and keeping track
+         of the path followed.  */
+      struct tree *link = trie->links;
+      struct tree *links[DEPTH_SIZE];
+      enum { L, R } dirs[DEPTH_SIZE];
+      links[0] = (struct tree *) &trie->links;  /* Stand-in parent: a store
+                                                   through its llink member
+                                                   is meant to update
+                                                   trie->links itself.  */
+      dirs[0] = L;
+      int depth = 1;
+
+      while (link && label != link->label)
+        {
+          links[depth] = link;
+          if (label < link->label)
+            dirs[depth++] = L, link = link->llink;
+          else
+            dirs[depth++] = R, link = link->rlink;
+        }
+
+      /* The current character doesn't have an outgoing link at
+         this trie node, so build a new trie node and install
+         a link in the current trie node's tree.  */
+      if (!link)
+        {
+          link = obstack_alloc (&kwset->obstack, sizeof *link);
+          link->llink = NULL;
+          link->rlink = NULL;
+          link->trie = obstack_alloc (&kwset->obstack, sizeof *link->trie);
+          link->trie->accepting = 0;
+          link->trie->links = NULL;
+          link->trie->parent = trie;
+          link->trie->next = NULL;
+          link->trie->fail = NULL;
+          link->trie->depth = trie->depth + 1;
+          link->trie->shift = 0;
+          link->label = label;
+          link->balance = 0;
+
+          /* Install the new tree node in its parent.  */
+          if (dirs[--depth] == L)
+            links[depth]->llink = link;
+          else
+            links[depth]->rlink = link;
+
+          /* Back up the tree fixing the balance flags.  */
+          while (depth && !links[depth]->balance)
+            {
+              if (dirs[depth] == L)
+                --links[depth]->balance;
+              else
+                ++links[depth]->balance;
+              --depth;
+            }
+
+          /* Rebalance the tree by pointer rotations if necessary.  */
+          if (depth && ((dirs[depth] == L && --links[depth]->balance)
+                        || (dirs[depth] == R && ++links[depth]->balance)))
+            {
+              struct tree *t, *r, *l, *rl, *lr;
+
+              switch (links[depth]->balance)
+                {
+                case (char) -2:
+                  switch (dirs[depth + 1])
+                    {
+                    case L:
+                      r = links[depth], t = r->llink, rl = t->rlink;
+                      t->rlink = r, r->llink = rl;
+                      t->balance = r->balance = 0;
+                      break;
+                    case R:
+                      r = links[depth], l = r->llink, t = l->rlink;
+                      rl = t->rlink, lr = t->llink;
+                      t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl;
+                      l->balance = t->balance != 1 ? 0 : -1;
+                      r->balance = t->balance != (char) -1 ? 0 : 1;
+                      t->balance = 0;
+                      break;
+                    default:
+                      abort ();
+                    }
+                  break;
+                case 2:
+                  switch (dirs[depth + 1])
+                    {
+                    case R:
+                      l = links[depth], t = l->rlink, lr = t->llink;
+                      t->llink = l, l->rlink = lr;
+                      t->balance = l->balance = 0;
+                      break;
+                    case L:
+                      l = links[depth], r = l->rlink, t = r->llink;
+                      lr = t->llink, rl = t->rlink;
+                      t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl;
+                      l->balance = t->balance != 1 ? 0 : -1;
+                      r->balance = t->balance != (char) -1 ? 0 : 1;
+                      t->balance = 0;
+                      break;
+                    default:
+                      abort ();
+                    }
+                  break;
+                default:
+                  abort ();
+                }
+
+              /* T is the new root of the rotated subtree; hook it into
+                 the node above.  */
+              if (dirs[depth - 1] == L)
+                links[depth - 1]->llink = t;
+              else
+                links[depth - 1]->rlink = t;
+            }
+        }
+
+      trie = link->trie;
+    }
+
+  /* Mark the node we finally reached as accepting, encoding the
+     index number of this word in the keyword set so far.  A node that
+     is already accepting keeps its earlier index.  */
+  if (!trie->accepting)
+    trie->accepting = 1 + 2 * kwset->words;
+  ++kwset->words;
+
+  /* Keep track of the longest and shortest string of the keyword set.  */
+  if (trie->depth < kwset->mind)
+    kwset->mind = trie->depth;
+  if (trie->depth > kwset->maxd)
+    kwset->maxd = trie->depth;
+}
+
+/* Enqueue the trie nodes referenced from the given tree in the
+   given queue.  *LAST is the current tail of the queue, which is
+   chained through the next member; it is advanced to each node
+   appended.  The nodes of both subtrees are appended before the node
+   of TREE itself.  */
+static void
+enqueue (struct tree *tree, struct trie **last)
+{
+  if (!tree)
+    return;
+  enqueue(tree->llink, last);
+  enqueue(tree->rlink, last);
+  (*last) = (*last)->next = tree->trie;
+}
+
+/* Compute the Aho-Corasick failure function for the trie nodes referenced
+   from the given tree, given the failure function for their parent as
+   well as a last resort failure node.  TREE is the tree of links, FAIL
+   the parent's failure node (possibly null), and RECOURSE the node
+   assigned when no node on the chain of fails has an edge with the
+   same label.  */
+static void
+treefails (struct tree const *tree, struct trie const *fail,
+           struct trie *recourse)
+{
+  struct tree *link;
+
+  if (!tree)
+    return;
+
+  treefails(tree->llink, fail, recourse);
+  treefails(tree->rlink, fail, recourse);
+
+  /* Find, in the chain of fails going back to the root, the first
+     node that has a descendant on the current label.  */
+  while (fail)
+    {
+      link = fail->links;
+      while (link && tree->label != link->label)
+        if (tree->label < link->label)
+          link = link->llink;
+        else
+          link = link->rlink;
+      if (link)
+        {
+          tree->trie->fail = link->trie;
+          return;
+        }
+      fail = fail->fail;
+    }
+
+  tree->trie->fail = recourse;
+}
+
+/* Set delta entries for the links of the given tree such that
+   the preexisting delta value is larger than the current depth.
*/
+static void
+treedelta (struct tree const *tree,
+           unsigned int depth,
+           unsigned char delta[])
+{
+  if (!tree)
+    return;
+  treedelta(tree->llink, depth, delta);
+  treedelta(tree->rlink, depth, delta);
+  if (depth < delta[tree->label])       /* only ever lower an entry */
+    delta[tree->label] = depth;
+}
+
+/* Return true if A has every label in B.  The result is 1 or 0, and
+   it is 1 when B is empty.  */
+static int _GL_ATTRIBUTE_PURE
+hasevery (struct tree const *a, struct tree const *b)
+{
+  if (!b)
+    return 1;
+  if (!hasevery(a, b->llink))
+    return 0;
+  if (!hasevery(a, b->rlink))
+    return 0;
+  while (a && b->label != a->label)
+    if (b->label < a->label)
+      a = a->llink;
+    else
+      a = a->rlink;
+  return !!a;
+}
+
+/* Compute a vector, indexed by character code, of the trie nodes
+   referenced from the given tree.  Only the entries of NEXT whose
+   index is a label in TREE are stored; the others are left alone.  */
+static void
+treenext (struct tree const *tree, struct trie *next[])
+{
+  if (!tree)
+    return;
+  treenext(tree->llink, next);
+  treenext(tree->rlink, next);
+  next[tree->label] = tree->trie;
+}
+
+/* Compute the shift for each trie node, as well as the delta
+   table and next cache for the given keyword set.  If the set holds
+   exactly one word, also extract that word into KWSET->target and
+   fill in KWSET->shift, gc1, gc2 and gc1help for the single-string
+   search.  */
+void
+kwsprep (kwset_t kwset)
+{
+  char const *trans = kwset->trans;
+  int i;
+  unsigned char deltabuf[NCHAR];
+  unsigned char *delta = trans ? deltabuf : kwset->delta;
+
+  /* Initial values for the delta table; will be changed later.  The
+     delta entry for a given character is the smallest depth of any
+     node at which an outgoing edge is labeled by that character.  */
+  memset (delta, MIN (kwset->mind, UCHAR_MAX), sizeof deltabuf);
+
+  /* Traverse the nodes of the trie in level order, simultaneously
+     computing the delta table, failure function, and shift function.  */
+  struct trie *curr, *last;
+  for (curr = last = kwset->trie; curr; curr = curr->next)
+    {
+      /* Enqueue the immediate descendants in the level order queue.  */
+      enqueue (curr->links, &last);
+
+      curr->shift = kwset->mind;
+      curr->maxshift = kwset->mind;
+
+      /* Update the delta table for the descendants of this node.  */
+      treedelta (curr->links, curr->depth, delta);
+
+      /* Compute the failure function for the descendants of this node.  */
+      treefails (curr->links, curr->fail, kwset->trie);
+
+      /* Update the shifts at each node in the current node's chain
+         of fails back to the root.  */
+      struct trie *fail;
+      for (fail = curr->fail; fail; fail = fail->fail)
+        {
+          /* If the current node has some outgoing edge that the fail
+             doesn't, then the shift at the fail should be no larger
+             than the difference of their depths.  */
+          if (!hasevery (fail->links, curr->links))
+            if (curr->depth - fail->depth < fail->shift)
+              fail->shift = curr->depth - fail->depth;
+
+          /* If the current node is accepting then the shift at the
+             fail and its descendants should be no larger than the
+             difference of their depths.  */
+          if (curr->accepting && fail->maxshift > curr->depth - fail->depth)
+            fail->maxshift = curr->depth - fail->depth;
+        }
+    }
+
+  /* Traverse the trie in level order again, fixing up all nodes whose
+     shift exceeds their inherited maxshift.  */
+  for (curr = kwset->trie->next; curr; curr = curr->next)
+    {
+      if (curr->maxshift > curr->parent->maxshift)
+        curr->maxshift = curr->parent->maxshift;
+      if (curr->shift > curr->maxshift)
+        curr->shift = curr->maxshift;
+    }
+
+  /* Create a vector, indexed by character code, of the outgoing links
+     from the root node.  */
+  struct trie *nextbuf[NCHAR];
+  struct trie **next = trans ? nextbuf : kwset->next;
+  memset (next, 0, sizeof nextbuf);
+  treenext (kwset->trie->links, next);
+  if (trans)
+    for (i = 0; i < NCHAR; ++i)
+      kwset->next[i] = next[U(trans[i])];
+
+  /* Check if we can use the simple boyer-moore algorithm, instead
+     of the hairy commentz-walter algorithm.  */
+  if (kwset->words == 1)
+    {
+      /* Looking for just one string.  Extract it from the trie.  */
+      kwset->target = obstack_alloc (&kwset->obstack, kwset->mind);
+      for (i = kwset->mind - 1, curr = kwset->trie; i >= 0; --i)
+        {
+          kwset->target[i] = curr->links->label;
+          curr = curr->next;
+        }
+      /* Looking for the delta2 shift that we might make after a
+         backwards match has failed.  Extract it from the trie.  */
+      if (kwset->mind > 1)
+        {
+          kwset->shift
+            = obstack_alloc (&kwset->obstack,
+                             sizeof *kwset->shift * (kwset->mind - 1));
+          for (i = 0, curr = kwset->trie->next; i < kwset->mind - 1; ++i)
+            {
+              kwset->shift[i] = curr->shift;
+              curr = curr->next;
+            }
+        }
+
+      char gc1 = tr (trans, kwset->target[kwset->mind - 1]);
+
+      /* Set GC1HELP according to whether exactly one, exactly two, or
+         three-or-more characters match GC1.  In the exactly-two case
+         the exclusive-or below cancels whichever of the two indexes
+         equals GC1 and leaves the other one; this presumes that GC1
+         is itself one of the two bytes that TRANS maps to GC1.  */
+      int gc1help = -1;
+      if (trans)
+        {
+          char const *equiv1 = memchr (trans, gc1, NCHAR);
+          char const *equiv2 = memchr (equiv1 + 1, gc1,
+                                       trans + NCHAR - (equiv1 + 1));
+          if (equiv2)
+            gc1help = (memchr (equiv2 + 1, gc1, trans + NCHAR - (equiv2 + 1))
+                       ? NCHAR
+                       : U(gc1) ^ (equiv1 - trans) ^ (equiv2 - trans));
+        }
+
+      kwset->gc1 = gc1;
+      kwset->gc1help = gc1help;
+      if (kwset->mind > 1)
+        kwset->gc2 = tr (trans, kwset->target[kwset->mind - 2]);
+    }
+
+  /* Fix things up for any translation table.  DELTA is the local
+     DELTABUF in this case, so copy it into KWSET->delta through
+     TRANS.  */
+  if (trans)
+    for (i = 0; i < NCHAR; ++i)
+      kwset->delta[i] = delta[U(trans[i])];
+}
+
+/* Delta2 portion of a Boyer-Moore search.  *TP is the string text
+   pointer; it is updated in place.  EP is the end of the string text,
+   and SP the end of the pattern.  LEN is the pattern length; it must
+   be at least 2.  TRANS, if nonnull, is the input translation table.
+   GC1 and GC2 are the last and second-from last bytes of the pattern,
+   transliterated by TRANS; the caller precomputes them for
+   efficiency.  If D1 is nonnull, it is a delta1 table for shifting *TP
+   when failing.  KWSET->shift says how much to shift.
*/
+static inline bool
+bm_delta2_search (char const **tpp, char const *ep, char const *sp, int len,
+                  char const *trans, char gc1, char gc2,
+                  unsigned char const *d1, kwset_t kwset)
+{
+  char const *tp = *tpp;
+  int d = len, skip = 0;
+
+  while (true)
+    {
+      int i = 2;
+      if (tr (trans, tp[-2]) == gc2)
+        {
+          while (++i <= d)
+            if (tr (trans, tp[-i]) != tr (trans, sp[-i]))
+              break;
+          if (i > d)
+            {
+              for (i = d + skip + 1; i <= len; ++i)
+                if (tr (trans, tp[-i]) != tr (trans, sp[-i]))
+                  break;
+              if (i > len)
+                {
+                  *tpp = tp - len;      /* the match starts LEN bytes
+                                           before TP */
+                  return true;
+                }
+            }
+        }
+
+      tp += d = kwset->shift[i - 2];
+      if (tp > ep)
+        break;
+      if (tr (trans, tp[-1]) != gc1)
+        {
+          if (d1)
+            tp += d1[U(tp[-1])];
+          break;
+        }
+      skip = i - 1;
+    }
+
+  *tpp = tp;
+  return false;
+}
+
+/* Return the address of the first byte in the buffer S (of size N)
+   that matches the last byte specified by KWSET, a singleton.
+   When KWSET->gc1help is negative this is a plain memchr for
+   KWSET->gc1.  Otherwise a leading part of the buffer is checked
+   byte by byte through KWSET->trans, and whatever remains is handed
+   to memchr2 with KWSET->gc1 and KWSET->gc1help.  */
+static char const *
+memchr_kwset (char const *s, size_t n, kwset_t kwset)
+{
+  if (kwset->gc1help < 0)
+    return memchr (s, kwset->gc1, n);
+  int small_heuristic = 2;
+  int small = (- (uintptr_t) s % sizeof (long)
+               + small_heuristic * sizeof (long));
+  size_t ntrans = kwset->gc1help < NCHAR && small < n ? small : n;
+  char const *slim = s + ntrans;
+  for (; s < slim; s++)
+    if (kwset->trans[U(*s)] == kwset->gc1)
+      return s;
+  n -= ntrans;
+  return n == 0 ? NULL : memchr2 (s, kwset->gc1, kwset->gc1help, n);
+}
+
+/* Fast Boyer-Moore search (inlinable version).  Search the SIZE
+   bytes at TEXT for the single keyword of KWSET, whose length is
+   KWSET->mind.  Return the offset in TEXT of the match found, or
+   (size_t) -1 if there is none.  A keyword of length zero matches
+   at offset 0.  */
+static inline size_t _GL_ATTRIBUTE_PURE
+bmexec_trans (kwset_t kwset, char const *text, size_t size)
+{
+  unsigned char const *d1;
+  char const *ep, *sp, *tp;
+  int d;
+  int len = kwset->mind;
+  char const *trans = kwset->trans;
+
+  if (len == 0)
+    return 0;
+  if (len > size)
+    return -1;
+  if (len == 1)
+    {
+      tp = memchr_kwset (text, size, kwset);
+      return tp ? tp - text : -1;
+    }
+
+  d1 = kwset->delta;
+  sp = kwset->target + len;
+  tp = text + len;
+  char gc1 = kwset->gc1;
+  char gc2 = kwset->gc2;
+
+  /* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2).  */
+  if (size > 12 * len)
+    /* 11 is not a bug, the initial offset happens only once.  */
+    for (ep = text + size - 11 * len; tp <= ep; )
+      {
+        char const *tp0 = tp;
+        d = d1[U(tp[-1])], tp += d;
+        d = d1[U(tp[-1])], tp += d;
+        if (d != 0)
+          {
+            d = d1[U(tp[-1])], tp += d;
+            d = d1[U(tp[-1])], tp += d;
+            d = d1[U(tp[-1])], tp += d;
+            if (d != 0)
+              {
+                d = d1[U(tp[-1])], tp += d;
+                d = d1[U(tp[-1])], tp += d;
+                d = d1[U(tp[-1])], tp += d;
+                if (d != 0)
+                  {
+                    d = d1[U(tp[-1])], tp += d;
+                    d = d1[U(tp[-1])], tp += d;
+
+                    /* As a heuristic, prefer memchr to seeking by
+                       delta1 when the latter doesn't advance much.  */
+                    int advance_heuristic = 16 * sizeof (long);
+                    if (advance_heuristic <= tp - tp0)
+                      continue;
+                    tp--;
+                    tp = memchr_kwset (tp, text + size - tp, kwset);
+                    if (! tp)
+                      return -1;
+                    tp++;
+                    if (ep <= tp)
+                      break;
+                  }
+              }
+          }
+        if (bm_delta2_search (&tp, ep, sp, len, trans, gc1, gc2, d1, kwset))
+          return tp - text;
+      }
+
+  /* Now we have only a few characters left to search.  We
+     carefully avoid ever producing an out-of-bounds pointer.  */
+  ep = text + size;
+  d = d1[U(tp[-1])];
+  while (d <= ep - tp)
+    {
+      d = d1[U((tp += d)[-1])];
+      if (d != 0)
+        continue;
+      if (bm_delta2_search (&tp, ep, sp, len, trans, gc1, gc2, NULL, kwset))
+        return tp - text;
+    }
+
+  return -1;
+}
+
+/* Fast Boyer-Moore search.  The arguments and the result are those
+   of bmexec_trans, which does all of the work.  */
+static size_t
+bmexec (kwset_t kwset, char const *text, size_t size)
+{
+  /* Help the compiler inline bmexec_trans in two ways, depending on
+     whether kwset->trans is null.  */
+  return (kwset->trans
+          ? bmexec_trans (kwset, text, size)
+          : bmexec_trans (kwset, text, size));
+}
+
+/* Hairy multiple string search.
*/ +static size_t _GL_ARG_NONNULL ((4)) +cwexec (kwset_t kwset, char const *text, size_t len, struct kwsmatch *kwsmatch) +{ + struct trie * const *next; + struct trie const *trie; + struct trie const *accept; + char const *beg, *lim, *mch, *lmch; + unsigned char c; + unsigned char const *delta; + int d; + char const *end, *qlim; + struct tree const *tree; + char const *trans; + +#ifdef lint + accept = NULL; +#endif + + /* Initialize register copies and look for easy ways out. A text + shorter than kwset->mind yields (size_t) -1 at once. When + kwset->mind is zero, the start of TEXT is taken as an already + known match and control jumps straight to the match phase. */ + if (len < kwset->mind) + return -1; + next = kwset->next; + delta = kwset->delta; + trans = kwset->trans; + lim = text + len; + end = text; + if ((d = kwset->mind) != 0) + mch = NULL; + else + { + mch = text, accept = kwset->trie; + goto match; + } + + if (len >= 4 * kwset->mind) + qlim = lim - 4 * kwset->mind; + else + qlim = NULL; + + while (lim - end >= d) + { + if (qlim && end <= qlim) + { + end += d - 1; + while ((d = delta[c = *end]) && end < qlim) + { + end += d; + end += delta[U(*end)]; + end += delta[U(*end)]; + } + ++end; + } + else + d = delta[c = (end += d)[-1]]; + if (d) + continue; + beg = end - 1; + trie = next[c]; + if (trie->accepting) + { + mch = beg; + accept = trie; + } + d = trie->shift; + while (beg > text) + { + unsigned char uc = *--beg; + c = trans ? trans[uc] : uc; + tree = trie->links; + while (tree && c != tree->label) + if (c < tree->label) + tree = tree->llink; + else + tree = tree->rlink; + if (tree) + { + trie = tree->trie; + if (trie->accepting) + { + mch = beg; + accept = trie; + } + } + else + break; + d = trie->shift; + } + if (mch) + goto match; + } + return -1; + + match: + /* Given a known match, find the longest possible match anchored + at or before its starting point. This is nearly a verbatim + copy of the preceding main search loops.
LIM is first clamped so that the rescan reaches at most + kwset->maxd bytes past the start of the known match. */ + if (lim - mch > kwset->maxd) + lim = mch + kwset->maxd; + lmch = 0; + d = 1; + while (lim - end >= d) + { + if ((d = delta[c = (end += d)[-1]]) != 0) + continue; + beg = end - 1; + if (!(trie = next[c])) + { + d = 1; + continue; + } + if (trie->accepting && beg <= mch) + { + lmch = beg; + accept = trie; + } + d = trie->shift; + while (beg > text) + { + unsigned char uc = *--beg; + c = trans ? trans[uc] : uc; + tree = trie->links; + while (tree && c != tree->label) + if (c < tree->label) + tree = tree->llink; + else + tree = tree->rlink; + if (tree) + { + trie = tree->trie; + if (trie->accepting && beg <= mch) + { + lmch = beg; + accept = trie; + } + } + else + break; + d = trie->shift; + } + if (lmch) + { + mch = lmch; + goto match; + } + if (!d) + d = 1; + } + /* Report the accepted keyword: its index, start offset and length. */ + kwsmatch->index = accept->accepting / 2; + kwsmatch->offset[0] = mch - text; + kwsmatch->size[0] = accept->depth; + + return mch - text; +} + +/* Search TEXT for a match of any member of KWSET. + Return the offset (into TEXT) of the first byte of the matching substring, + or (size_t) -1 if no match is found. Upon a match, store details in + *KWSMATCH: index of matched keyword, start offset (same as the return + value), and length. A set with exactly one keyword is searched + with bmexec, reporting index 0 and length kwset->mind; any other + set is searched with cwexec. */ +size_t +kwsexec (kwset_t kwset, char const *text, size_t size, + struct kwsmatch *kwsmatch) +{ + if (kwset->words == 1) + { + size_t ret = bmexec (kwset, text, size); + if (ret != (size_t) -1) + { + kwsmatch->index = 0; + kwsmatch->offset[0] = ret; + kwsmatch->size[0] = kwset->mind; + } + return ret; + } + else + return cwexec (kwset, text, size, kwsmatch); +} + +/* Free the components of the given keyword set: first everything + allocated on its obstack, then the set structure itself. */ +void +kwsfree (kwset_t kwset) +{ + obstack_free (&kwset->obstack, NULL); + free (kwset); +} diff --git a/src/kwset.h b/src/kwset.h new file mode 100644 index 0000000..95693e3 --- /dev/null +++ b/src/kwset.h @@ -0,0 +1,60 @@ +/* kwset.h - header declaring the keyword set library. + Copyright (C) 1989, 1998, 2005, 2007, 2009-2016 Free Software Foundation, + Inc.
+ + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA + 02110-1301, USA. */ + +/* Written August 1989 by Mike Haertel. + The author may be reached (Email) at the address mike@ai.mit.edu, + or (US mail) as Mike Haertel c/o Free Software Foundation. */ + +#include + +struct kwsmatch +{ + size_t index; /* Index number of matching keyword. */ + size_t offset[1]; /* Offset of each submatch. */ + size_t size[1]; /* Length of each submatch. */ +}; + +#include "arg-nonnull.h" + +struct kwset; +typedef struct kwset *kwset_t; + +/* Return an opaque pointer to a newly allocated keyword set. A nonnull arg + specifies a table of character translations to be applied to all + pattern and search text. */ +extern kwset_t kwsalloc (char const *); + +/* Incrementally extend the keyword set to include the given string. + Remember an index number for each keyword included in the set. */ +extern void kwsincr (kwset_t, char const *, size_t); + +/* When the keyword set has been completely built, prepare it for use. */ +extern void kwsprep (kwset_t); + +/* Search through the given buffer for a member of the keyword set. + Return the offset (into the buffer) of the first byte of the match, + or (size_t) -1 if no match is found.
Upon a match, store details + in the struct kwsmatch the last argument points to: the index of + the matched keyword, the start offset (same as the return value), + and the length. */ +extern size_t kwsexec (kwset_t, char const *, size_t, struct kwsmatch *) + _GL_ARG_NONNULL ((4)); + +/* Deallocate the given keyword set and all its associated storage. */ +extern void kwsfree (kwset_t); diff --git a/src/pcresearch.c b/src/pcresearch.c new file mode 100644 index 0000000..f6e72b0 --- /dev/null +++ b/src/pcresearch.c @@ -0,0 +1,389 @@ +/* pcresearch.c - searching subroutines using PCRE for grep. + Copyright 2000, 2007, 2009-2016 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA + 02110-1301, USA. */ + +/* Written August 1992 by Mike Haertel. */ + +#include +#include "search.h" + +#if HAVE_LIBPCRE +# include + +/* This must be at least 2; everything after that is for performance + in pcre_exec. */ +enum { NSUB = 300 }; + +/* Compiled internal form of a Perl regular expression. */ +static pcre *cre; + +/* Additional information about the pattern. */ +static pcre_extra *extra; + +# ifndef PCRE_STUDY_JIT_COMPILE +# define PCRE_STUDY_JIT_COMPILE 0 +# endif + +# if PCRE_STUDY_JIT_COMPILE +/* Maximum size of the JIT stack.
*/ +static int jit_stack_size; +# endif + +/* Match the already-compiled PCRE pattern against the data in SUBJECT, + of size SEARCH_BYTES and starting with offset SEARCH_OFFSET, with + options OPTIONS, and storing resulting matches into SUB. Return + the (nonnegative) match location or a (negative) error number. + When PCRE_STUDY_JIT_COMPILE is nonzero and pcre_exec fails with + PCRE_ERROR_JIT_STACKLIMIT, double jit_stack_size (provided it is + positive and at most INT_MAX / 2), free any previous JIT stack, + allocate and assign a new one, and retry the match. Exit with + status EXIT_TROUBLE if the new stack cannot be allocated. */ +static int +jit_exec (char const *subject, int search_bytes, int search_offset, + int options, int *sub) +{ + while (true) + { + int e = pcre_exec (cre, extra, subject, search_bytes, search_offset, + options, sub, NSUB); + +# if PCRE_STUDY_JIT_COMPILE + if (e == PCRE_ERROR_JIT_STACKLIMIT + && 0 < jit_stack_size && jit_stack_size <= INT_MAX / 2) + { + int old_size = jit_stack_size; + int new_size = jit_stack_size = old_size * 2; + static pcre_jit_stack *jit_stack; + if (jit_stack) + pcre_jit_stack_free (jit_stack); + jit_stack = pcre_jit_stack_alloc (old_size, new_size); + if (!jit_stack) + error (EXIT_TROUBLE, 0, + _("failed to allocate memory for the PCRE JIT stack")); + pcre_assign_jit_stack (extra, NULL, jit_stack); + continue; + } +# endif + + return e; + } +} + +#endif + +#if HAVE_LIBPCRE +/* Table, indexed by ! (flag & PCRE_NOTBOL), of whether the empty + string matches when that flag is used. */ +static int empty_match[2]; + +static bool multibyte_locale; +#endif + +void +Pcompile (char const *pattern, size_t size) +{ +#if !HAVE_LIBPCRE + error (EXIT_TROUBLE, 0, "%s", + _("support for the -P option is not compiled into " + "this --disable-perl-regexp binary")); +#else + int e; + char const *ep; + static char const wprefix[] = "(? + +#include +#include +#include +#include +#include + +#include "system.h" +#include "error.h" +#include "grep.h" +#include "dfa.h" +#include "kwset.h" +#include "xalloc.h" + +_GL_INLINE_HEADER_BEGIN +#ifndef SEARCH_INLINE +# define SEARCH_INLINE _GL_INLINE +#endif + +/* This must be a signed type. Each value is the difference in the size + of a character (in bytes) induced by converting to lower case.
+ The vast majority of values are 0, but a few are 1 or -1, so + technically, two bits may be sufficient. */ +typedef signed char mb_len_map_t; + +/* searchutils.c */ +extern void kwsinit (kwset_t *); + +extern void build_mbclen_cache (void); +extern size_t mbclen_cache[]; +extern ptrdiff_t mb_goback (char const **, char const *, char const *); +extern wint_t mb_prev_wc (char const *, char const *, char const *); +extern wint_t mb_next_wc (char const *, char const *); + +/* dfasearch.c */ +extern void GEAcompile (char const *, size_t, reg_syntax_t); +extern size_t EGexecute (char *, size_t, size_t *, char const *); + +/* kwsearch.c */ +extern void Fcompile (char const *, size_t); +extern size_t Fexecute (char *, size_t, size_t *, char const *); + +/* pcresearch.c */ +extern void Pcompile (char const *, size_t); +extern size_t Pexecute (char *, size_t, size_t *, char const *); + +/* Return the number of bytes in the character at the start of S, which + is of size N. N must be positive. MBS is the conversion state. + This acts like mbrlen, except it returns 1 when mbrlen would return 0, + and it is typically faster because of the cache. The length is + looked up in mbclen_cache by the first byte of S; only when the + cached value is (size_t) -2 is mbrlen called on the N bytes at S. */ +SEARCH_INLINE size_t +mb_clen (char const *s, size_t n, mbstate_t *mbs) +{ + size_t len = mbclen_cache[to_uchar (*s)]; + return len == (size_t) -2 ? mbrlen (s, n, mbs) : len; +} + +_GL_INLINE_HEADER_END + +#endif /* GREP_SEARCH_H */ diff --git a/src/searchutils.c b/src/searchutils.c new file mode 100644 index 0000000..1f21a0e --- /dev/null +++ b/src/searchutils.c @@ -0,0 +1,127 @@ +/* searchutils.c - helper subroutines for grep's matchers. + Copyright 1992, 1998, 2000, 2007, 2009-2016 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version.
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA + 02110-1301, USA. */ + +#include + +#define SEARCH_INLINE _GL_EXTERN_INLINE +#define SYSTEM_INLINE _GL_EXTERN_INLINE +#include "search.h" + +#define NCHAR (UCHAR_MAX + 1) + +size_t mbclen_cache[NCHAR]; +/* Allocate a keyword set and store it in *KWSET. When match_icase is + set and MB_CUR_MAX is 1, give the set a translation table that maps + each of the NCHAR byte values through toupper; otherwise allocate + the set with no translation table. Call xalloc_die if kwsalloc + returns a null pointer. */ +void +kwsinit (kwset_t *kwset) +{ + static char trans[NCHAR]; + int i; + + if (match_icase && MB_CUR_MAX == 1) + { + for (i = 0; i < NCHAR; ++i) + trans[i] = toupper (i); + + *kwset = kwsalloc (trans); + } + else + *kwset = kwsalloc (NULL); + + if (!*kwset) + xalloc_die (); +} + +/* Initialize a cache of mbrlen values for each of its 1-byte inputs. + Each entry of mbclen_cache is indexed by the byte converted to + unsigned char. A result of 0 from mbrlen is stored as 1; every + other result, including the error values, is stored unchanged. */ +void +build_mbclen_cache (void) +{ + int i; + + for (i = CHAR_MIN; i <= CHAR_MAX; ++i) + { + char c = i; + unsigned char uc = i; + mbstate_t mbs = { 0 }; + size_t len = mbrlen (&c, 1, &mbs); + mbclen_cache[uc] = len ? len : 1; + } +} + +/* In the buffer *MB_START, return the number of bytes needed to go + back from CUR to the previous boundary, where a "boundary" is the + start of a multibyte character or is an error-encoding byte. The + buffer ends at END (i.e., one past the address of the buffer's last + byte). If CUR is already at a boundary, return 0. If *MB_START is + greater than or equal to CUR, return the negative value CUR - *MB_START. + + When returning zero, set *MB_START to CUR. When returning a + positive value, set *MB_START to the next boundary after CUR, or to + END if there is no such boundary. When returning a negative value, + leave *MB_START alone.
*/ +ptrdiff_t +mb_goback (char const **mb_start, char const *cur, char const *end) +{ + const char *p = *mb_start; + const char *p0 = p; + mbstate_t cur_state; + + memset (&cur_state, 0, sizeof cur_state); + + while (p < cur) + { + size_t clen = mb_clen (p, end - p, &cur_state); + + if ((size_t) -2 <= clen) + { + /* An invalid sequence, or a truncated multibyte character. + Treat it as a single byte character. */ + clen = 1; + memset (&cur_state, 0, sizeof cur_state); + } + p0 = p; + p += clen; + } + /* P has now reached or passed CUR, unless the loop never ran. P0 is + where the last character scanned began, or the original *MB_START + if nothing was scanned. */ + *mb_start = p; + return p == cur ? 0 : cur - p0; +} + +/* In the buffer BUF, return the wide character that is encoded just + before CUR. The buffer ends at END. Return WEOF if there is no + wide character just before CUR, in particular when CUR equals BUF. + Otherwise step back one byte, use mb_goback to move to the boundary + at or before that byte, and decode from there with mb_next_wc. */ +wint_t +mb_prev_wc (char const *buf, char const *cur, char const *end) +{ + if (cur == buf) + return WEOF; + char const *p = buf; + cur--; + cur -= mb_goback (&p, cur, end); + return mb_next_wc (cur, end); +} + +/* Return the wide character that is encoded at CUR. The buffer ends + at END. Return WEOF if there is no wide character encoded at CUR, + that is, when CUR equals END or when mbrtowc returns (size_t) -1 + or (size_t) -2 for the bytes from CUR up to END. */ +wint_t +mb_next_wc (char const *cur, char const *end) +{ + wchar_t wc; + mbstate_t mbs = { 0 }; + return (end - cur != 0 && mbrtowc (&wc, cur, end - cur, &mbs) < (size_t) -2 + ? wc : WEOF); +} diff --git a/src/system.h b/src/system.h new file mode 100644 index 0000000..6f4918d --- /dev/null +++ b/src/system.h @@ -0,0 +1,110 @@ +/* Portability cruft. Include after config.h and sys/types.h. + Copyright 1996, 1998-2000, 2007, 2009-2016 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA + 02110-1301, USA. */ + +#ifndef GREP_SYSTEM_H +#define GREP_SYSTEM_H 1 + +#include +#include +#include + +#include "binary-io.h" +#include "configmake.h" +#include "dirname.h" +#include "ignore-value.h" +#include "minmax.h" +#include "same-inode.h" + +#include +#include +#include +#include +#include +/* Status value passed to error () when a fatal problem is reported. */ +enum { EXIT_TROUBLE = 2 }; + +#include +#define N_(String) gettext_noop(String) +#define _(String) gettext(String) + +#include + +#ifndef initialize_main +# define initialize_main(argcp, argvp) +#endif + +#include "unlocked-io.h" + +_GL_INLINE_HEADER_BEGIN +#ifndef SYSTEM_INLINE +# define SYSTEM_INLINE _GL_INLINE +#endif +/* True if the strings A and B compare equal under strcmp. */ +#define STREQ(a, b) (strcmp (a, b) == 0) + +/* Convert a possibly-signed character to an unsigned character. This is + a bit safer than casting to unsigned char, since it catches some type + errors that the cast doesn't. */ +SYSTEM_INLINE unsigned char +to_uchar (char ch) +{ + return ch; +} + +_GL_INLINE_HEADER_END + +#ifndef __has_feature +# define __has_feature(F) false +#endif +/* Define HAVE_ASAN to 1 when either __SANITIZE_ADDRESS__ is defined or + __has_feature (address_sanitizer) holds, and to 0 otherwise. */ +#if defined __SANITIZE_ADDRESS__ || __has_feature (address_sanitizer) +# define HAVE_ASAN 1 +#else +# define HAVE_ASAN 0 +#endif + +#if HAVE_ASAN + +/* Mark memory region [addr, addr+size) as unaddressable. + This memory must be previously allocated by the user program. Accessing + addresses in this region from instrumented code is forbidden until + this region is unpoisoned. This function is not guaranteed to poison + the whole region - it may poison only a subregion of [addr, addr+size) + due to ASan alignment restrictions. + Method is NOT thread-safe in the sense that no two threads can + (un)poison memory in the same memory region simultaneously.
*/ +void __asan_poison_memory_region (void const volatile *addr, size_t size); + +/* Mark memory region [addr, addr+size) as addressable. + This memory must be previously allocated by the user program. Accessing + addresses in this region is allowed until this region is poisoned again. + This function may unpoison a superregion of [addr, addr+size) due to + ASan alignment restrictions. + Method is NOT thread-safe in the sense that no two threads can + (un)poison memory in the same memory region simultaneously. */ +void __asan_unpoison_memory_region (void const volatile *addr, size_t size); + +#else +/* When HAVE_ASAN is 0, define both functions as static no-ops with the + same parameters; presumably this lets callers use them without + their own preprocessor conditionals. */ +static _GL_UNUSED void +__asan_poison_memory_region (void const volatile *addr, size_t size) { } +static _GL_UNUSED void +__asan_unpoison_memory_region (void const volatile *addr, size_t size) { } +#endif + +#endif -- cgit v1.2.1