summaryrefslogtreecommitdiff
path: root/gcc/ada/g-regpat.ads
blob: 5489f0f362dcb50fd9edc247b15089b215a49f22 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
------------------------------------------------------------------------------
--                                                                          --
--                         GNAT LIBRARY COMPONENTS                          --
--                                                                          --
--                          G N A T . R E G P A T                           --
--                                                                          --
--                                 S p e c                                  --
--                                                                          --
--                                                                          --
--               Copyright (C) 1986 by University of Toronto.               --
--           Copyright (C) 1996-2001 Ada Core Technologies, Inc.            --
--                                                                          --
-- GNAT is free software;  you can  redistribute it  and/or modify it under --
-- terms of the  GNU General Public License as published  by the Free Soft- --
-- ware  Foundation;  either version 2,  or (at your option) any later ver- --
-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
-- or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License --
-- for  more details.  You should have  received  a copy of the GNU General --
-- Public License  distributed with GNAT;  see file COPYING.  If not, write --
-- to  the Free Software Foundation,  59 Temple Place - Suite 330,  Boston, --
-- MA 02111-1307, USA.                                                      --
--                                                                          --
-- As a special exception,  if other files  instantiate  generics from this --
-- unit, or you link  this unit with other files  to produce an executable, --
-- this  unit  does not  by itself cause  the resulting  executable  to  be --
-- covered  by the  GNU  General  Public  License.  This exception does not --
-- however invalidate  any other reasons why  the executable file  might be --
-- covered by the  GNU Public License.                                      --
--                                                                          --
-- GNAT is maintained by Ada Core Technologies Inc (http://www.gnat.com).   --
--                                                                          --
------------------------------------------------------------------------------

--  This package implements roughly the same set of regular expressions as
--  are available in the Perl or Python programming languages.

--  This is an extension of the original V7 style regular expression library
--  written in C by Henry Spencer. Apart from the translation to Ada, the
--  interface has been considerably changed to use the Ada String type
--  instead of C-style nul-terminated strings.

------------------------------------------------------------
-- Summary of Pattern Matching Packages in GNAT Hierarchy --
------------------------------------------------------------

--  There are three related packages that perform pattern matching functions.
--  The following is an outline of these packages, to help you determine
--  which is best for your needs.

--     GNAT.Regexp (files g-regexp.ads/g-regexp.adb)
--       This is a simple package providing Unix-style regular expression
--       matching with the restriction that it matches entire strings. It
--       is particularly useful for file name matching, and in particular
--       it provides "globbing patterns" that are useful in implementing
--       unix or DOS style wild card matching for file names.

--     GNAT.Regpat (files g-regpat.ads/g-regpat.adb)
--       This is a more complete implementation of Unix-style regular
--       expressions, copied from the Perl regular expression engine,
--       written originally in C by Henry Spencer. It is functionally the
--       same as that library.

--     GNAT.Spitbol.Patterns (files g-spipat.ads/g-spipat.adb)
--       This is a completely general pattern matching package based on the
--       pattern language of SNOBOL4, as implemented in SPITBOL. The pattern
--       language is modeled on context free grammars, with context sensitive
--       extensions that provide full (type 0) computational capabilities.

package GNAT.Regpat is
pragma Preelaborate (Regpat);

   --  The grammar is the following:

   --     regexp ::= expr
   --            ::= ^ expr               -- anchor at the beginning of string
   --            ::= expr $               -- anchor at the end of string
   --     expr   ::= term
   --            ::= term | term          -- alternation (term or term ...)
   --     term   ::= item
   --            ::= item item ...        -- concatenation (item then item)
   --     item   ::= elmt                 -- match elmt
   --            ::= elmt *               -- zero or more elmt's
   --            ::= elmt +               -- one or more elmt's
   --            ::= elmt ?               -- matches elmt or nothing
   --            ::= elmt *?              -- zero or more times, minimum number
   --            ::= elmt +?              -- one or more times, minimum number
   --            ::= elmt ??              -- zero or one time, minimum number
   --            ::= elmt { num }         -- matches elmt exactly num times
   --            ::= elmt { num , }       -- matches elmt at least num times
   --            ::= elmt { num , num2 }  -- matches between num and num2 times
   --            ::= elmt { num }?        -- matches elmt exactly num times
   --            ::= elmt { num , }?      -- matches elmt at least num times
   --                                        non-greedy version
   --            ::= elmt { num , num2 }? -- matches between num and num2 times
   --                                        non-greedy version
   --     elmt   ::= nchr                 -- matches given character
   --            ::= [range range ...]    -- matches any character listed
   --            ::= [^ range range ...]  -- matches any character not listed
   --            ::= .                    -- matches any single character
   --                                     -- except newlines
   --            ::= ( expr )             -- parens used for grouping
   --            ::= \ num                -- reference to num-th parenthesis
   --     range  ::= char - char          -- matches chars in given range
   --            ::= nchr
   --            ::= [: posix :]          -- any character in the POSIX range
   --            ::= [:^ posix :]         -- not in the POSIX range
   --     posix  ::= alnum                -- alphanumeric characters
   --            ::= alpha                -- alphabetic characters
   --            ::= ascii                -- ascii characters (0 .. 127)
   --            ::= cntrl                -- control chars (0..31, 127..159)
   --            ::= digit                -- digits ('0' .. '9')
   --            ::= graph                -- graphic chars (32..126, 160..255)
   --            ::= lower                -- lower case characters
   --            ::= print                -- printable characters (32..127)
   --            ::= punct                -- printable, except alphanumeric
   --            ::= space                -- space characters
   --            ::= upper                -- upper case characters
   --            ::= word                 -- alphanumeric characters
   --            ::= xdigit               -- hexadecimal chars (0..9, a..f)

   --     char   ::= any character, including special characters
   --                ASCII.NUL is not supported.
   --     nchr   ::= any character except \()[].*+?^ or \char to match char
   --                \n means a newline (ASCII.LF)
   --                \t means a tab (ASCII.HT)
   --                \r means a return (ASCII.CR)
   --                \b matches the empty string at the beginning or end of a
   --                   word. A word is defined as a set of alphanumerical
   --                   characters (see \w below).
   --                \B matches the empty string only when *not* at the
   --                   beginning or end of a word.
   --                \d matches any digit character ([0-9])
   --                \D matches any non digit character ([^0-9])
   --                \s matches any white space character. This is equivalent
   --                   to [ \t\n\r\f\v]  (tab, form-feed, vertical-tab, ...)
   --                \S matches any non-white space character.
   --                \w matches any alphanumeric character or underscore.
   --                   This includes accented letters, as defined in the
   --                   package Ada.Characters.Handling.
   --                \W matches any non-alphanumeric character.
   --                \A matches the empty string only at the beginning of the
   --                   string, whatever flags are used for Compile (the
   --                   behavior of ^ can change, see Regexp_Flags below).
   --                \G matches the empty string only at the end of the
   --                   string, whatever flags are used for Compile (the
   --                   behavior of $ can change, see Regexp_Flags below).
   --     ...    ::= is used to indicate repetition (one or more terms)

   --  Embedded newlines are not matched by the ^ operator.
   --  It is possible to retrieve the substring matched by a parenthesized
   --  expression. Although the depth of parentheses is not limited in the
   --  regexp, only the first 9 substrings can be retrieved.

   --  The highest value possible for the arguments to the curly operator ({})
   --  is given by the constant Max_Curly_Repeat below.

   --  The operators '*', '+', '?' and '{}' always match the longest possible
   --  substring. They all have a non-greedy version (with an extra ? after the
   --  operator), which matches the shortest possible substring.

   --  For instance:
   --      regexp="<.*>"   string="<h1>title</h1>"   matches="<h1>title</h1>"
   --      regexp="<.*?>"  string="<h1>title</h1>"   matches="<h1>"
   --
   --  '{' and '}' are only considered as special characters if they appear
   --  in a substring that looks exactly like '{n}', '{n,m}' or '{n,}', where
   --  n and m are digits. No space is allowed. In other contexts, the curly
   --  braces will simply be treated as normal characters.

   --  Compiling Regular Expressions
   --  =============================

   --  To use this package, you first need to compile the regular expression
   --  (a string) into a byte-code program, in a Pattern_Matcher structure.
   --  This first step checks that the regexp is valid, and optimizes the
   --  matching algorithms of the second step.

   --  Two versions of the Compile subprogram are given: one in which this
   --  package will compute itself the best possible size to allocate for the
   --  byte code; the other where you must allocate enough memory yourself. An
   --  exception is raised if there is not enough memory.

   --     declare
   --        Regexp : String := "a|b";

   --        Matcher : Pattern_Matcher := Compile (Regexp);
   --        --  The size for matcher is automatically allocated

   --        Matcher2 : Pattern_Matcher (1000);
   --        --  Some space is allocated directly.

   --     begin
   --        Compile (Matcher2, Regexp);
   --        ...
   --     end;

   --  Note that the second version is significantly faster, since with the
   --  first version the regular expression has in fact to be compiled twice
   --  (first to compute the size, then to generate the byte code).

   --  Note also that you can not use the function version of Compile if you
   --  specify the size of the Pattern_Matcher, since the discriminants will
   --  most probably be different and you will get a Constraint_Error

   --  Matching Strings
   --  ================

   --  Once the regular expression has been compiled, you can use it as often
   --  as needed to match strings.

   --  Several versions of the Match subprogram are provided, with different
   --  parameters and return results.

   --  See the description under each of these subprograms.

   --  Here is a short example showing how to get the substring matched by
   --  the first parenthesis pair.

   --     declare
   --        Matches : Match_Array;
   --        Regexp  : String := "a(b|c)d";
   --        Str     : String := "gacdg";

   --     begin
   --        Match (Compile (Regexp), Str, Matches);
   --        return Str (Matches (1).First .. Matches (1).Last);
   --        --  returns 'c'
   --     end;

   --  String Substitution
   --  ===================

   --  No subprogram is currently provided for string substitution.
   --  However, this is easy to simulate with the parenthesis groups, as
   --  shown below.

   --  This example swaps the first two words of the string:

   --     declare
   --        Regexp  : String := "([a-z]+) +([a-z]+)";
   --        Str     : String := " first   second third ";
   --        Matches : Match_Array;

   --     begin
   --        Match (Compile (Regexp), Str, Matches);
   --        return Str (Str'First .. Matches (1).First - 1)
   --               & Str (Matches (2).First .. Matches (2).Last)
   --               & " "
   --               & Str (Matches (1).First .. Matches (1).Last)
   --               & Str (Matches (2).Last + 1 .. Str'Last);
   --        --  returns " second first third "
   --     end;

   ---------------
   -- Constants --
   ---------------

   Expression_Error : exception;
   --  This exception is raised when trying to compile an invalid
   --  regular expression. All subprograms taking an expression
   --  as a parameter may raise Expression_Error.

   Max_Paren_Count : constant := 255;
   --  Maximum number of parentheses in a regular expression.
   --  This is limited by the size of a Character, as found in the
   --  byte-compiled version of regular expressions.

   Max_Program_Size : constant := 2**15 - 1;
   --  Maximum size that can be allocated for a program (32767). This is
   --  the upper bound of the Program_Size type declared below.

   Max_Curly_Repeat : constant := 32767;
   --  Maximum number of repetitions for the curly operator.
   --  The values in the {n}, {n,} and {n,m} operators cannot be higher
   --  than this constant, since they have to fit on two characters in the
   --  byte-compiled version of regular expressions.

   type Program_Size is range 0 .. Max_Program_Size;
   for Program_Size'Size use 16;
   --  Number of bytes allocated for the byte-compiled version of a regular
   --  expression. The size clause requests a 16-bit representation.

   type Regexp_Flags is mod 256;
   for Regexp_Flags'Size use 8;
   --  Flags that can be given at compile time to specify default
   --  properties for the regular expression. The flag constants declared
   --  below have distinct bit values (see the private part), so several of
   --  them can be combined with the "or" operator of this modular type.

   No_Flags         : constant Regexp_Flags;
   --  No flag set. This is the default value of the Flags parameter of the
   --  Compile subprograms below.

   Case_Insensitive : constant Regexp_Flags;
   --  The automaton is optimized so that the matching is done in a case
   --  insensitive manner (upper case characters and lower case characters
   --  are all treated the same way).

   Single_Line      : constant Regexp_Flags;
   --  Treat the Data we are matching as a single line. This means that
   --  ^ and $ will ignore \n (unless Multiple_Lines is also specified),
   --  and that '.' will match \n.

   Multiple_Lines   : constant Regexp_Flags;
   --  Treat the Data as multiple lines. This means that ^ and $ will also
   --  match on internal newlines (ASCII.LF), in addition to the beginning
   --  and end of the string.
   --
   --  This can be combined with Single_Line.

   -----------------
   -- Match_Array --
   -----------------

   subtype Match_Count is Natural range 0 .. Max_Paren_Count;
   --  Index subtype of Match_Array, and result subtype of Paren_Count

   type Match_Location is record
      First : Natural := 0;
      Last  : Natural := 0;
   end record;
   --  Bounds of a matched substring, intended to be used as a slice
   --  Str (First .. Last) of the string that was matched (see the examples
   --  in the introduction of this package). The default value of a
   --  Match_Location is equal to No_Match.

   type Match_Array is array (Match_Count range <>) of Match_Location;
   --  The substring matching a given pair of parentheses.
   --  Index 0 is the whole substring that matched the full regular
   --  expression.
   --
   --  For instance, if your regular expression is something like:
   --  "a(b*)(c+)", then Match_Array(1) will be the indexes of the
   --  substring that matched "b*" and Match_Array(2) will be the indexes
   --  of the substring that matched "c+".
   --
   --  The number of parenthesis groups that can be retrieved is limited only
   --  by Max_Paren_Count (the upper bound of the index subtype Match_Count),
   --  and all the Match subprograms below can use a Match_Array of any size.
   --  Indexes that do not have any matching parenthesis are set to
   --  No_Match.

   No_Match : constant Match_Location := (First => 0, Last => 0);
   --  The No_Match constant is (0, 0) to differentiate between
   --  matching a null string at position 1, which uses (1, 0),
   --  and no match at all.

   ------------------------------
   -- Pattern_Matcher Creation --
   ------------------------------

   type Pattern_Matcher (Size : Program_Size) is private;
   --  Type used to represent a regular expression compiled into byte code.
   --  The discriminant Size is the number of bytes reserved for the byte
   --  code program.

   Never_Match : constant Pattern_Matcher;
   --  A regular expression that never matches anything (its full
   --  declaration in the private part has Size = 0, i.e. an empty program).

   function Compile
     (Expression : String;
      Flags      : Regexp_Flags := No_Flags)
      return       Pattern_Matcher;
   --  Compile a regular expression into internal code.
   --  Raises Expression_Error if Expression is not a legal regular expression.
   --  The appropriate size is calculated automatically, but this means that
   --  the regular expression has to be compiled twice (the first time to
   --  calculate the size, the second time to actually generate the byte code).
   --
   --  Flags is the default value to use to set properties for Expression
   --  (case sensitivity, ...).

   procedure Compile
     (Matcher         : out Pattern_Matcher;
      Expression      : String;
      Final_Code_Size : out Program_Size;
      Flags           : Regexp_Flags := No_Flags);
   --  Compile a regular expression into internal code.
   --  This procedure is significantly faster than the function
   --  Compile, as there is a known maximum size for the matcher.
   --  This procedure raises Storage_Error if Matcher is too small
   --  to hold the resulting code, or Expression_Error if Expression
   --  is not a legal regular expression.
   --
   --  Final_Code_Size is set to the final program size.
   --
   --  Flags is the default value to use to set properties for Expression
   --  (case sensitivity, ...).

   procedure Compile
     (Matcher    : out Pattern_Matcher;
      Expression : String;
      Flags      : Regexp_Flags := No_Flags);
   --  Same procedure as above, except it does not return the final
   --  program size.

   function Paren_Count (Regexp : Pattern_Matcher) return Match_Count;
   pragma Inline (Paren_Count);

   --  Return the number of parenthesis pairs in Regexp.

   --  This is the maximum index that will be filled if a Match_Array is
   --  used as an argument to Match.
   --
   --  Thus, if you want to be sure to get all the parenthesis groups, you
   --  should do something like:
   --
   --     declare
   --        Regexp  : Pattern_Matcher := Compile ("a(b*)(c+)");
   --        Matched : Match_Array (0 .. Paren_Count (Regexp));
   --     begin
   --        Match (Regexp, "a string", Matched);
   --     end;

   -------------
   -- Quoting --
   -------------

   function Quote (Str : String) return String;
   --  Return a version of Str so that every special character is quoted.
   --  The resulting string can be used in a regular expression to match
   --  exactly Str, whatever characters are present in Str.

   --------------
   -- Matching --
   --------------

   procedure Match
     (Expression     : String;
      Data           : String;
      Matches        : out Match_Array;
      Size           : Program_Size := 0);
   --  Match Expression against Data and store result in Matches.
   --  This procedure raises Storage_Error if Size is too small for
   --  Expression, or Expression_Error if Expression is not a legal regular
   --  expression.
   --  If Size is 0, then the appropriate size is automatically calculated
   --  by this package, but this is slightly slower.
   --
   --  At most Matches'Length parenthesis groups are returned.

   function  Match
     (Expression : String;
      Data       : String;
      Size       : Program_Size := 0)
      return       Natural;
   --  Return the position where Data matches, or (Data'First - 1) if there is
   --  no match.
   --  This function raises Storage_Error if Size is too small for Expression,
   --  or Expression_Error if Expression is not a legal regular expression.
   --  If Size is 0, then the appropriate size is automatically calculated
   --  by this package, but this is slightly slower.

   function Match
     (Expression : String;
      Data       : String;
      Size       : Program_Size := 0)
      return       Boolean;
   --  Return True if Data matches Expression. This function raises
   --  Storage_Error if Size is too small for Expression, or Expression_Error
   --  if Expression is not a legal regular expression.
   --
   --  If Size is 0, then the appropriate size is automatically calculated
   --  by this package, but this is slightly slower.

   ------------------------------------------------
   -- Matching a pre-compiled regular expression --
   ------------------------------------------------

   --  The following functions are significantly faster if you need to reuse
   --  the same regular expression multiple times, since you only have to
   --  compile it once.

   function  Match
     (Self : Pattern_Matcher;
      Data : String)
      return Natural;
   --  Return the position where Data matches, or (Data'First - 1) if there is
   --  no match. Raises Expression_Error if Expression is not a legal regular
   --  expression.
   --
   --  NOTE(review): this subprogram has no Expression parameter (Self is an
   --  already compiled pattern), so it is unclear when Expression_Error can
   --  be raised here; confirm against the package body.

   pragma Inline (Match);
   --  All Match subprograms except the last one (declared below).

   procedure Match
     (Self    : Pattern_Matcher;
      Data    : String;
      Matches : out Match_Array);
   --  Match Data using the given pattern matcher and store result in Matches.
   --  Raises Expression_Error if Expression is not a legal regular expression.
   --  The expression matches if Matches (0) /= No_Match.
   --
   --  NOTE(review): this subprogram has no Expression parameter (Self is an
   --  already compiled pattern), so it is unclear when Expression_Error can
   --  be raised here; confirm against the package body.
   --
   --  At most Matches'Length parenthesis groups are returned.

   -----------
   -- Debug --
   -----------

   procedure Dump (Self : Pattern_Matcher);
   --  Dump the compiled version of the regular expression matched by Self.
   --  NOTE(review): the output destination and format are not stated in
   --  this spec; see the package body.

--------------------------
-- Private Declarations --
--------------------------

private

   subtype Pointer is Program_Size;
   --  The Pointer type is used to point into Program_Data.

   --  Note that the pointer type is not necessarily 2 bytes,
   --  although it is stored in the program using 2 bytes.

   type Program_Data is array (Pointer range <>) of Character;
   --  Type of the Program component of Pattern_Matcher, which holds the
   --  compiled regular expression, one Character per element.

   Program_First : constant := 1;
   --  Lower bound of the Program component of a Pattern_Matcher

   --  The "internal use only" fields in Pattern_Matcher are present to pass
   --  info from compile to execute that permits the execute phase
   --  to run much faster on simple cases.  They are:

   --     First              character that must begin a match or ASCII.Nul
   --     Anchored           true iff match must start at beginning of line
   --     Must_Have          pointer to string that match must include or null
   --     Must_Have_Length   length of Must_Have string

   --  First and Anchored permit very fast decisions on suitable
   --  starting points for a match, cutting down the work a lot.
   --  Must_Have permits fast rejection of lines that cannot possibly
   --  match.

   --  The Must_Have tests are costly enough that Optimize
   --  supplies a Must_Have only if the r.e. contains something potentially
   --  expensive (at present, the only such thing detected is * or +
   --  at the start of the r.e., which can involve a lot of backup).
   --  The length is supplied because the test in Execute needs it
   --  and Optimize is computing it anyway.

   --  The initialization is meant to be fail-safe in case the user of this
   --  package tries to use an uninitialized matcher. This takes advantage
   --  of the knowledge that ASCII.Nul translates to the end-of-program (EOP)
   --  instruction code of the state machine.

   --  Full declarations of the flag constants. Each flag other than
   --  No_Flags uses a distinct bit of Regexp_Flags.

   No_Flags         : constant Regexp_Flags := 0;
   Case_Insensitive : constant Regexp_Flags := 1;
   Single_Line      : constant Regexp_Flags := 2;
   Multiple_Lines   : constant Regexp_Flags := 4;

   --  Full view of Pattern_Matcher. Every component has a default value, so
   --  a matcher that was declared but never compiled holds a program made
   --  only of ASCII.NUL characters.

   type Pattern_Matcher (Size : Pointer) is record
      First            : Character    := ASCII.NUL;  --  internal use only
      Anchored         : Boolean      := False;      --  internal use only
      Must_Have        : Pointer      := 0;          --  internal use only
      Must_Have_Length : Natural      := 0;          --  internal use only
      Paren_Count      : Natural      := 0;          --  # paren groups
      Flags            : Regexp_Flags := No_Flags;
      --  Presumably the flags given to Compile; confirm in the body

      Program          : Program_Data (Program_First .. Size) :=
                           (others => ASCII.NUL);
      --  Byte code, indexed from Program_First up to the discriminant Size
   end record;

   --  Positional aggregate, in component order: Size, First, Anchored,
   --  Must_Have, Must_Have_Length, Paren_Count, Flags, Program. Since Size
   --  is 0, the Program component has the null range Program_First .. 0.

   Never_Match : constant Pattern_Matcher :=
      (0, ASCII.NUL, False, 0, 0, 0, No_Flags, (others => ASCII.NUL));

end GNAT.Regpat;