132html


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313

#! /usr/bin/perl -w

# Script to turn PCRE man pages into HTML


# Subroutine to handle font changes and other escapes
#
# Argument: one line (or fragment) of man page text
# Returns:  the text with HTML-special characters turned into entities,
#           \fI...\fR / \fB...\fP font changes turned into <i>/<b> markup,
#           the roff escape \e turned into a backslash, and "(c)" after
#           "Copyright " turned into &copy;

sub do_line {
my($s) = $_[0];

# Ampersands must be escaped before anything else; otherwise the entities
# generated below would themselves be mangled. The .EX handler in the main
# loop escapes "&" in the same way.

$s =~ s/&/&amp;/g;
$s =~ s/</&#60;/g;                   # Deal with < and >
$s =~ s/>/&#62;/g;

# Font changes: the non-greedy match pairs each \fI or \fB with the nearest
# following \fR or \fP.

$s =~ s"\\fI(.*?)\\f[RP]"<i>$1</i>"g;
$s =~ s"\\fB(.*?)\\f[RP]"<b>$1</b>"g;

$s =~ s"\\e"\\"g;                    # \e is roff for a literal backslash
$s =~ s/(?<=Copyright )\(c\)/&copy;/g;
return $s;
}

# Subroutine to ensure not in a paragraph
#
# If a paragraph is open, its closing markup is written to TEMP (preceded by
# the close of a literal section when one is open inside it). In every case
# the paragraph, literal-section and "text written" flags are cleared.

sub end_para {
print TEMP ($inpre ? "</PRE>\n</P>\n" : "</P>\n") if ($inpara);
$inpre = 0;
$inpara = 0;
$wrotetext = 0;
}

# Subroutine to start a new paragraph
#
# Any paragraph that is currently open is closed first, then the opening
# markup is written to TEMP and the in-paragraph flag is set.

sub new_para {
end_para();
print TEMP ("<P>\n");
$inpara = 1;
}


# Main program

# Conversion state, shared with the subroutines above. All the flags start
# off clear; section reference numbers count up from 1.

($innf, $inpara, $inpre, $wrotetext, $toc) = (0, 0, 0, 0, 0);
$ref = 1;

# Consume leading options. Only -toc is acted on; any other argument that
# starts with a hyphen is discarded.

while (@ARGV && $ARGV[0] =~ /^-/)
  {
  my($option) = shift(@ARGV);
  $toc = 1 if ($option eq "-toc");
  }

# Initial output to STDOUT. The first remaining argument is the page name; it
# is used in the title and heading. A missing name is treated as empty, and
# HTML-special characters in it are escaped before it is interpolated.

use Fcntl qw(O_WRONLY O_CREAT O_EXCL);

my($pagename) = defined($ARGV[0]) ? $ARGV[0] : "";
$pagename =~ s/&/&amp;/g;
$pagename =~ s/</&lt;/g;
$pagename =~ s/>/&gt;/g;

print <<End ;
<html>
<head>
<title>$pagename specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>$pagename man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
End

print "<ul>\n" if ($toc);

# The body of the page is collected in a temporary file so that the table of
# contents can go to STDOUT first. The file name is predictable, so it must
# not be opened with a plain truncating open: that would follow a symbolic
# link planted in /tmp. Remove any stale entry (unlink removes a link itself,
# not its target) and then insist on creating the file afresh with O_EXCL,
# readable and writable only by the owner.

unlink("/tmp/$$");
sysopen(TEMP, "/tmp/$$", O_WRONLY | O_CREAT | O_EXCL, 0600) ||
  die "Can't open /tmp/$$ for output: $!\n";

# Read the man page from STDIN a line at a time. Lines that start with a dot
# are roff requests or comments and are dispatched below; all other lines are
# text. Body output goes to TEMP; table of contents entries go to STDOUT.

while (<STDIN>)
  {
  # Handle lines beginning with a dot

  if (/^\./)
    {
    # Some of the PCRE man pages used to contain instances of .br. However,
    # they should have all been removed because they cause trouble in some
    # (other) automated systems that translate man pages to HTML. Complain if
    # we find .br or .in (another macro that is deprecated).

    if (/^\.br/ || /^\.in/)
      {
      print STDERR "\n*** Deprecated macro encountered - rewrite needed\n";
      print STDERR "*** $_\n";
      close(TEMP);               # Do not leave the temporary file behind
      unlink("/tmp/$$");
      die "*** Processing abandoned\n";
      }

    # Instead of .br, relevant "literal" sections are enclosed in .nf/.fi.

    elsif (/^\.nf/)
      {
      $innf = 1;
      }

    elsif (/^\.fi/)
      {
      $innf = 0;
      }

    # Handling .sp is subtle. If it is inside a literal section, do nothing if
    # the next line is a non literal text line; similarly, if not inside a
    # literal section, do nothing if a literal follows. The point being that
    # the <pre> and </pre> that delimit literal sections will do the spacing.
    # Always skip if no previous output.

    elsif (/^\.sp/)
      {
      if ($wrotetext)
        {
        $_ = <STDIN>;
        last if (!defined($_));  # .sp was the last line: nothing to space
        if ($inpre)
          {
          print TEMP "\n" if (/^[\s.]/);
          }
        else
          {
          print TEMP "<br>\n<br>\n" if (!/^[\s.]/);
          }
        redo;    # Now process the lookahead line we just read
        }
      }
    elsif (/^\.TP/ || /^\.PP/ || /^\.P/)
      {
      &new_para();
      }
    elsif (/^\.SH\s*("?)(.*)\1/)
      {
      my($heading) = $2;         # Save before any other match is attempted

      # Ignore the NAME section
      if ($heading =~ /^NAME\b/)
        {
        <STDIN>;                 # Discard the line that follows the heading
        next;
        }

      &end_para();
      my($title) = &do_line($heading);
      if ($toc)
        {
        # The title is passed as an argument, not interpolated into the
        # format, so that a "%" in a heading is output literally. Every
        # section heading links back to the first TOC entry.

        printf("<li><a name=\"TOC%d\" href=\"#SEC%d\">%s</a>\n",
          $ref, $ref, $title);
        printf TEMP ("<br><a name=\"SEC%d\" href=\"#TOC1\">%s</a><br>\n",
          $ref, $title);
        $ref++;
        }
      else
        {
        print TEMP "<br><b>\n$title\n</b><br>\n";
        }
      }
    elsif (/^\.SS\s*("?)(.*)\1/)
      {
      my($heading) = $2;
      &end_para();
      my($title) = &do_line($heading);
      print TEMP "<br><b>\n$title\n</b><br>\n";
      }
    elsif (/^\.B\s*(.*)/)
      {
      my($text) = $1;
      &new_para() if (!$inpara);
      $_ = &do_line($text);
      s/"(.*?)"/$1/g;            # Remove roff argument quoting
      print TEMP "<b>$_</b>\n";
      $wrotetext = 1;
      }
    elsif (/^\.I\s*(.*)/)
      {
      my($text) = $1;
      &new_para() if (!$inpara);
      $_ = &do_line($text);
      s/"(.*?)"/$1/g;            # Remove roff argument quoting
      print TEMP "<i>$_</i>\n";
      $wrotetext = 1;
      }

    # A comment that starts "HREF" takes the next line as a name that
    # is turned into a hyperlink, using the text given, which might be
    # in a special font. If it ends in () or (digits) or punctuation, they
    # aren't part of the link.

    elsif (/^\.\\"\s*HREF/)
      {
      $_=<STDIN>;
      last if (!defined($_));    # HREF comment was the last line
      chomp;
      $_ = &do_line($_);
      $_ =~ s/\s+$//;
      if (/^(?:<.>)?([^<(]+)(?:\(\))?(?:<\/.>)?(?:\(\d+\))?[.,;:]?$/)
        {
        print TEMP "<a href=\"$1.html\">$_</a>\n";
        }
      else
        {
        # No usable name: output the text without a link rather than a link
        # to ".html".

        print STDERR "*** Cannot derive a link target from: $_\n";
        print TEMP "$_\n";
        }
      }

    # A comment that starts "HTML" inserts literal HTML

    elsif (/^\.\\"\s*HTML\s*(.*)/)
      {
      print TEMP $1;
      }

    # A comment that starts < inserts that HTML at the end of the
    # *next* input line - so as not to get a newline between them.

    elsif (/^\.\\"\s*(<.*>)/)
      {
      my($markup) = $1;
      $_=<STDIN>;
      $_ = "" if (!defined($_)); # At end of input, output the markup alone
      chomp;
      $_ = &do_line($_);
      $_ =~ s/\s+$//;
      print TEMP "$_$markup\n";
      }

    # A comment that starts JOIN joins the next two lines together, with one
    # space between them. Then that line is processed. This is used in some
    # displays where two lines are needed for the "man" version. JOINSH works
    # the same, except that it assumes this is a shell command, so removes
    # continuation backslashes.

    elsif (/^\.\\"\s*JOIN(SH)?/)
      {
      my($shell) = defined($1) ? 1 : 0;
      my($one,$two);
      $one = <STDIN>;
      $two = <STDIN>;
      last if (!defined($one));          # Nothing left to join
      $two = "" if (!defined($two));     # Only one line was left
      $one =~ s/\s*\\e\s*$// if ($shell);
      chomp($one);
      $two =~ s/^\s+//;
      $_ = "$one $two";
      redo;            # Process the joined lines
      }

    # .EX/.EE are used in the pcredemo page to bracket the entire program,
    # which is unmodified except for turning backslash into "\e".

    elsif (/^\.EX\s*$/)
      {
      print TEMP "<PRE>\n";
      while (<STDIN>)
        {
        last if /^\.EE\s*$/;
        s/\\e/\\/g;
        s/&/&amp;/g;
        s/</&lt;/g;
        s/>/&gt;/g;
        print TEMP;
        }
      print TEMP "</PRE>\n";   # Close the literal section opened above
      }

    # Ignore anything not recognized

    next;
    }

  # Line does not begin with a dot. Replace blank lines with new paragraphs

  if (/^\s*$/)
    {
    &end_para() if ($wrotetext);
    next;
    }

  # Convert fonts changes and output an ordinary line. Ensure that indented
  # lines are marked as literal.

  $_ = &do_line($_);
  &new_para() if (!$inpara);

  if (/^\s/)
    {
    if (!$inpre)
      {
      print TEMP "<pre>\n";
      $inpre = 1;
      }
    }
  elsif ($inpre)
    {
    print TEMP "</pre>\n";
    $inpre = 0;
    }

  # Add <br> to the end of a non-literal line if we are within .nf/.fi

  $_ .= "<br>\n" if (!$inpre && $innf);

  print TEMP;
  $wrotetext = 1;
  }

# Close any paragraph (and literal section) that is still open at the end of
# the input, so that the text which follows the body is not swallowed by it.

&end_para();

# The TOC, if present, will have been written - terminate it

print "</ul>\n" if ($toc);

# Copy the remainder to the standard output. Buffered write errors (a full
# disk, for example) only show up when the file is closed, so the close is
# checked; otherwise a truncated page would be copied without any complaint.

if (!close(TEMP))
  {
  my($why) = $!;
  unlink("/tmp/$$");
  die "Can't finish writing /tmp/$$: $why\n";
  }

open(TEMP, "<", "/tmp/$$") || die "Can't open /tmp/$$ for input: $!\n";

print while (<TEMP>);

print <<End ;
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
End

close(TEMP);
unlink("/tmp/$$");

# End