package Pod::Text; =head1 NAME Pod::Text - convert POD data to formatted ASCII text =head1 SYNOPSIS use Pod::Text; pod2text("perlfunc.pod"); Also: pod2text [B<-a>] [B<->I] < input.pod =head1 DESCRIPTION Pod::Text is a module that can convert documentation in the POD format (such as can be found throughout the Perl distribution) into formatted ASCII. Termcap is optionally supported for boldface/underline, and can enabled via C<$Pod::Text::termcap=1>. If termcap has not been enabled, then backspaces will be used to simulate bold and underlined text. A separate F program is included that is primarily a wrapper for Pod::Text. The single function C can take the optional options B<-a> for an alternative output format, then a B<->I option with the max terminal width, followed by one or two arguments. The first should be the name of a file to read the pod from, or "E&STDIN" to read from STDIN. A second argument, if provided, should be a filehandle glob where output should be sent. =head1 AUTHOR Tom Christiansen EFE =head1 TODO Cleanup work. The input and output locations need to be more flexible, termcap shouldn't be a global variable, and the terminal speed needs to be properly calculated. =cut use Term::Cap; require Exporter; @ISA = Exporter; @EXPORT = qw(pod2text); use vars qw($VERSION); $VERSION = "1.0204"; use locale; # make \w work right in non-ASCII lands $termcap=0; $opt_alt_format = 0; #$use_format=1; $UNDL = "\x1b[4m"; $INV = "\x1b[7m"; $BOLD = "\x1b[1m"; $NORM = "\x1b[0m"; sub pod2text { shift if $opt_alt_format = ($_[0] eq '-a'); if($termcap and !$setuptermcap) { $setuptermcap=1; my($term) = Tgetent Term::Cap { TERM => undef, OSPEED => 9600 }; $UNDL = $term->{'_us'}; $INV = $term->{'_mr'}; $BOLD = $term->{'_md'}; $NORM = $term->{'_me'}; } $SCREEN = ($_[0] =~ /^-(\d+)/ && (shift, $1)) || $ENV{COLUMNS} || ($ENV{TERMCAP} =~ /co#(\d+)/)[0] || ($^O ne 'MSWin32' && $^O ne 'dos' && (`stty -a 2>/dev/null` =~ /(\d+) columns/)[0]) || 72; @_ = ("<&STDIN") unless @_; local($file,*OUTPUT) = @_; *OUTPUT = *STDOUT if @_<2; local $: = $:; $: = " \n" if $opt_alt_format; # Do not break ``-L/lib/'' into ``- L/lib/''. $/ = ""; $FANCY = 0; $cutting = 1; $DEF_INDENT = 4; $indent = $DEF_INDENT; $needspace = 0; $begun = ""; open(IN, $file) || die "Couldn't open $file: $!"; POD_DIRECTIVE: while () { if ($cutting) { next unless /^=/; $cutting = 0; } if ($begun) { if (/^=end\s+$begun/) { $begun = ""; } elsif ($begun eq "text") { print OUTPUT $_; } next; } 1 while s{^(.*?)(\t+)(.*)$}{ $1 . (' ' x (length($2) * 8 - length($1) % 8)) . $3 }me; # Translate verbatim paragraph if (/^\s/) { output($_); next; } if (/^=for\s+(\S+)\s*(.*)/s) { if ($1 eq "text") { print OUTPUT $2,""; } else { # ignore unknown for } next; } elsif (/^=begin\s+(\S+)\s*(.*)/s) { $begun = $1; if ($1 eq "text") { print OUTPUT $2.""; } next; } sub prepare_for_output { s/\s*$/\n/; &init_noremap; # need to hide E<> first; they're processed in clear_noremap s/(E<[^<>]+>)/noremap($1)/ge; $maxnest = 10; while ($maxnest-- && /[A-Z]/``$1''/sg; s/F<(.*?)>/"$1"/sg; } else { s/C<(.*?)>/`$1'/sg; } } else { s/C<(.*?)>/noremap("E${1}E")/sge; } # s/[IF]<(.*?)>/italic($1)/ge; s/I<(.*?)>/*$1*/sg; # s/[CB]<(.*?)>/bold($1)/ge; s/X<.*?>//sg; # LREF: a la HREF L s:L<([^|>]+)\|[^>]+>:$1:g; # LREF: a manpage(3f) s:L<([a-zA-Z][^\s\/]+)($[^$]+\))?>:the $1$2 manpage:g; # LREF: an =item on another manpage s{ L< ([^/]+) / ( [:\w]+ ()? ) > } {the "$2" entry in the $1 manpage}gx; # LREF: an =item on this manpage s{ ((?: L< / ( [:\w]+ ()? ) > (,?\s+(and\s+)?)? )+) } { internal_lrefs($1) }gex; # LREF: a =head2 (head1?), maybe on a manpage, maybe right here # the "func" can disambiguate s{ L< (?: ([a-zA-Z]\S+?) / )? "?(.*?)"? > }{ do { $1 # if no $1, assume it means on this page. ? "the section on \"$2\" in the $1 manpage" : "the section on \"$2\"" } }sgex; s/[A-Z]<(.*?)>/$1/sg; } clear_noremap(1); } &prepare_for_output; if (s/^=//) { # $needspace = 0; # Assume this. # s/\n/ /g; ($Cmd, $_) = split(' ', $_, 2); # clear_noremap(1); if ($Cmd eq 'cut') { $cutting = 1; } elsif ($Cmd eq 'pod') { $cutting = 0; } elsif ($Cmd eq 'head1') { makespace(); if ($opt_alt_format) { print OUTPUT "\n"; s/^(.+?)[ \t]*$/==== $1 ====/; } print OUTPUT; # print OUTPUT uc($_); $needspace = $opt_alt_format; } elsif ($Cmd eq 'head2') { makespace(); # s/(\w+)/\u\L$1/g; #print ' ' x $DEF_INDENT, $_; # print "\xA7"; s/(\w)/\xA7 $1/ if $FANCY; if ($opt_alt_format) { s/^(.+?)[ \t]*$/== $1 ==/; print OUTPUT "\n", $_; } else { print OUTPUT ' ' x ($DEF_INDENT/2), $_, "\n"; } $needspace = $opt_alt_format; } elsif ($Cmd eq 'over') { push(@indent,$indent); $indent += ($_ + 0) || $DEF_INDENT; } elsif ($Cmd eq 'back') { $indent = pop(@indent); warn "Unmatched =back\n" unless defined $indent; } elsif ($Cmd eq 'item') { makespace(); # s/\A(\s*)\*/$1\xb7/ if $FANCY; # s/^(\s*\*\s+)/$1 /; { if (length() + 3 < $indent) { my $paratag = $_; $_ = ; if (/^[=\s]/) { # tricked!, or verbatim paragraph local($indent) = $indent[$#indent - 1] || $DEF_INDENT; output($paratag); redo POD_DIRECTIVE; } &prepare_for_output; IP_output($paratag, $_); } else { local($indent) = $indent[$#indent - 1] || $DEF_INDENT; output($_, 0); } } } else { warn "Unrecognized directive: $Cmd\n"; } } else { # clear_noremap(1); makespace(); output($_, 1); } } close(IN); } ######################################################################### sub makespace { if ($needspace) { print OUTPUT "\n"; $needspace = 0; } } sub bold { my $line = shift; return $line if $use_format; if($termcap) { $line = "$BOLD$line$NORM"; } else { $line =~ s/(.)/$1\b$1/g; } # $line = "$BOLD$line$NORM" if $ansify; return $line; } sub italic { my $line = shift; return $line if $use_format; if($termcap) { $line = "$UNDL$line$NORM"; } else { $line =~ s/(.)/$1\b_/g; } # $line = "$UNDL$line$NORM" if $ansify; return $line; } # Fill a paragraph including underlined and overstricken chars. # It's not perfect for words longer than the margin, and it's probably # slow, but it works. sub fill { local $_ = shift; my $par = ""; my $indent_space = " " x $indent; my $marg = $SCREEN-$indent; my $line = $indent_space; my $line_length; foreach (split) { my $word_length = length; $word_length -= 2 while /\010/g; # Subtract backspaces if ($line_length + $word_length > $marg) { $par .= $line . "\n"; $line= $indent_space . $_; $line_length = $word_length; } else { if ($line_length) { $line_length++; $line .= " "; } $line_length += $word_length; $line .= $_; } } $par .= "$line\n" if $line; $par .= "\n"; return $par; } sub IP_output { local($tag, $_) = @_; local($tag_indent) = $indent[$#indent - 1] || $DEF_INDENT; $tag_cols = $SCREEN - $tag_indent; $cols = $SCREEN - $indent; $tag =~ s/\s*$//; s/\s+/ /g; s/^ //; $str = "format OUTPUT = \n" . (($opt_alt_format && $tag_indent > 1) ? ":" . " " x ($tag_indent - 1) : " " x ($tag_indent)) . '@' . ('<' x ($indent - $tag_indent - 1)) . "^" . ("<" x ($cols - 1)) . "\n" . '$tag, $_' . "\n~~" . (" " x ($indent-2)) . "^" . ("<" x ($cols - 5)) . "\n" . '$_' . "\n\n.\n1"; #warn $str; warn "tag is $tag, _ is $_"; eval $str || die; write OUTPUT; } sub output { local($_, $reformat) = @_; if ($reformat) { $cols = $SCREEN - $indent; s/\s+/ /g; s/^ //; $str = "format OUTPUT = \n~~" . (" " x ($indent-2)) . "^" . ("<" x ($cols - 5)) . "\n" . '$_' . "\n\n.\n1"; eval $str || die; write OUTPUT; } else { s/^/' ' x $indent/gem; s/^\s+\n$/\n/gm; s/^ /: /s if defined($reformat) && $opt_alt_format; print OUTPUT; } } sub noremap { local($thing_to_hide) = shift; $thing_to_hide =~ tr/\000-\177/\200-\377/; return $thing_to_hide; } sub init_noremap { die "unmatched init" if $mapready++; #mask off high bit characters in input stream s/([\200-\377])/"E<".ord($1).">"/ge; } sub clear_noremap { my $ready_to_print = $_[0]; die "unmatched clear" unless $mapready--; tr/\200-\377/\000-\177/; # now for the E<>s, which have been hidden until now # otherwise the interative \w<> processing would have # been hosed by the E s { E< ( ( \d+ ) | ( [A-Za-z]+ ) ) > } { do { defined $2 ? chr($2) : defined $HTML_Escapes{$3} ? do { $HTML_Escapes{$3} } : do { warn "Unknown escape: E<$1> in $_"; "E<$1>"; } } }egx if $ready_to_print; } sub internal_lrefs { local($_) = shift; s{L]+)>}{$1}g; my(@items) = split( /(?:,?\s+(?:and\s+)?)/ ); my $retstr = "the "; my $i; for ($i = 0; $i <= $#items; $i++) { $retstr .= "C<$items[$i]>"; $retstr .= ", " if @items > 2 && $i != $#items; $retstr .= " and " if $i+2 == @items; } $retstr .= " entr" . ( @items > 1 ? "ies" : "y" ) . " elsewhere in this document "; return $retstr; } BEGIN { if (ord("\t") == 9) { %HTML_Escapes = ( 'amp' => '&', # ampersand 'lt' => '<', # left chevron, less-than 'gt' => '>', # right chevron, greater-than 'quot' => '"', # double quote "Aacute" => "\xC1", # capital A, acute accent "aacute" => "\xE1", # small a, acute accent "Acirc" => "\xC2", # capital A, circumflex accent "acirc" => "\xE2", # small a, circumflex accent "AElig" => "\xC6", # capital AE diphthong (ligature) "aelig" => "\xE6", # small ae diphthong (ligature) "Agrave" => "\xC0", # capital A, grave accent "agrave" => "\xE0", # small a, grave accent "Aring" => "\xC5", # capital A, ring "aring" => "\xE5", # small a, ring "Atilde" => "\xC3", # capital A, tilde "atilde" => "\xE3", # small a, tilde "Auml" => "\xC4", # capital A, dieresis or umlaut mark "auml" => "\xE4", # small a, dieresis or umlaut mark "Ccedil" => "\xC7", # capital C, cedilla "ccedil" => "\xE7", # small c, cedilla "Eacute" => "\xC9", # capital E, acute accent "eacute" => "\xE9", # small e, acute accent "Ecirc" => "\xCA", # capital E, circumflex accent "ecirc" => "\xEA", # small e, circumflex accent "Egrave" => "\xC8", # capital E, grave accent "egrave" => "\xE8", # small e, grave accent "ETH" => "\xD0", # capital Eth, Icelandic "eth" => "\xF0", # small eth, Icelandic "Euml" => "\xCB", # capital E, dieresis or umlaut mark "euml" => "\xEB", # small e, dieresis or umlaut mark "Iacute" => "\xCD", # capital I, acute accent "iacute" => "\xED", # small i, acute accent "Icirc" => "\xCE", # capital I, circumflex accent "icirc" => "\xEE", # small i, circumflex accent "Igrave" => "\xCD", # capital I, grave accent "igrave" => "\xED", # small i, grave accent "Iuml" => "\xCF", # capital I, dieresis or umlaut mark "iuml" => "\xEF", # small i, dieresis or umlaut mark "Ntilde" => "\xD1", # capital N, tilde "ntilde" => "\xF1", # small n, tilde "Oacute" => "\xD3", # capital O, acute accent "oacute" => "\xF3", # small o, acute accent "Ocirc" => "\xD4", # capital O, circumflex accent "ocirc" => "\xF4", # small o, circumflex accent "Ograve" => "\xD2", # capital O, grave accent "ograve" => "\xF2", # small o, grave accent "Oslash" => "\xD8", # capital O, slash "oslash" => "\xF8", # small o, slash "Otilde" => "\xD5", # capital O, tilde "otilde" => "\xF5", # small o, tilde "Ouml" => "\xD6", # capital O, dieresis or umlaut mark "ouml" => "\xF6", # small o, dieresis or umlaut mark "szlig" => "\xDF", # small sharp s, German (sz ligature) "THORN" => "\xDE", # capital THORN, Icelandic "thorn" => "\xFE", # small thorn, Icelandic "Uacute" => "\xDA", # capital U, acute accent "uacute" => "\xFA", # small u, acute accent "Ucirc" => "\xDB", # capital U, circumflex accent "ucirc" => "\xFB", # small u, circumflex accent "Ugrave" => "\xD9", # capital U, grave accent "ugrave" => "\xF9", # small u, grave accent "Uuml" => "\xDC", # capital U, dieresis or umlaut mark "uuml" => "\xFC", # small u, dieresis or umlaut mark "Yacute" => "\xDD", # capital Y, acute accent "yacute" => "\xFD", # small y, acute accent "yuml" => "\xFF", # small y, dieresis or umlaut mark "lchevron" => "\xAB", # left chevron (double less than) "rchevron" => "\xBB", # right chevron (double greater than) ); } else { # This hash assumes code page IBM-1047: %HTML_Escapes = ( 'amp' => '&', # ampersand 'lt' => '<', # left chevron, less-than 'gt' => '>', # right chevron, greater-than 'quot' => '"', # double quote "Aacute" => "\x65", # capital A, acute accent "aacute" => "\x45", # small a, acute accent "Acirc" => "\x62", # capital A, circumflex accent "acirc" => "\x42", # small a, circumflex accent "AElig" => "\x9E", # capital AE diphthong (ligature) "aelig" => "\x9C", # small ae diphthong (ligature) "Agrave" => "\x64", # capital A, grave accent "agrave" => "\x44", # small a, grave accent "Aring" => "\x67", # capital A, ring "aring" => "\x47", # small a, ring "Atilde" => "\x66", # capital A, tilde "atilde" => "\x46", # small a, tilde "Auml" => "\x63", # capital A, dieresis or umlaut mark "auml" => "\x43", # small a, dieresis or umlaut mark "Ccedil" => "\x68", # capital C, cedilla "ccedil" => "\x48", # small c, cedilla "Eacute" => "\x71", # capital E, acute accent "eacute" => "\x51", # small e, acute accent "Ecirc" => "\x72", # capital E, circumflex accent "ecirc" => "\x52", # small e, circumflex accent "Egrave" => "\x74", # capital E, grave accent "egrave" => "\x54", # small e, grave accent "ETH" => "\xAC", # capital Eth, Icelandic "eth" => "\x8C", # small eth, Icelandic "Euml" => "\x73", # capital E, dieresis or umlaut mark "euml" => "\x53", # small e, dieresis or umlaut mark "Iacute" => "\x75", # capital I, acute accent "iacute" => "\x55", # small i, acute accent "Icirc" => "\x76", # capital I, circumflex accent "icirc" => "\x56", # small i, circumflex accent "Igrave" => "\x75", # capital I, grave accent "igrave" => "\x55", # small i, grave accent "Iuml" => "\x77", # capital I, dieresis or umlaut mark "iuml" => "\x57", # small i, dieresis or umlaut mark "Ntilde" => "\x69", # capital N, tilde "ntilde" => "\x49", # small n, tilde "Oacute" => "\xEE", # capital O, acute accent "oacute" => "\xCE", # small o, acute accent "Ocirc" => "\xEB", # capital O, circumflex accent "ocirc" => "\xCB", # small o, circumflex accent "Ograve" => "\xED", # capital O, grave accent "ograve" => "\xCD", # small o, grave accent "Oslash" => "\x80", # capital O, slash "oslash" => "\x70", # small o, slash "Otilde" => "\xEF", # capital O, tilde "otilde" => "\xCF", # small o, tilde "Ouml" => "\xEC", # capital O, dieresis or umlaut mark "ouml" => "\xCC", # small o, dieresis or umlaut mark "szlig" => "\x59", # small sharp s, German (sz ligature) "THORN" => "\xAE", # capital THORN, Icelandic "thorn" => "\x8E", # small thorn, Icelandic "Uacute" => "\xFE", # capital U, acute accent "uacute" => "\xDE", # small u, acute accent "Ucirc" => "\xFB", # capital U, circumflex accent "ucirc" => "\xDB", # small u, circumflex accent "Ugrave" => "\xFD", # capital U, grave accent "ugrave" => "\xDD", # small u, grave accent "Uuml" => "\xFC", # capital U, dieresis or umlaut mark "uuml" => "\xDC", # small u, dieresis or umlaut mark "Yacute" => "\xBA", # capital Y, acute accent "yacute" => "\x8D", # small y, acute accent "yuml" => "\xDF", # small y, dieresis or umlaut mark "lchevron" => "\x8A", # left chevron (double less than) "rchevron" => "\x8B", # right chevron (double greater than) ); } } 1;