diff options
author | Lorry <lorry@roadtrain.codethink.co.uk> | 2012-07-20 16:48:06 +0100 |
---|---|---|
committer | Lorry <lorry@roadtrain.codethink.co.uk> | 2012-07-20 16:48:06 +0100 |
commit | 8e8edc70b402e69953fb9135a58d7faee12dc088 (patch) | |
tree | e8815e7ef185ccaba26bcd616eace378572b922e /fo/pdf2index | |
download | docbook-xsl-8e8edc70b402e69953fb9135a58d7faee12dc088.tar.gz |
Tarball conversion
Diffstat (limited to 'fo/pdf2index')
-rwxr-xr-x | fo/pdf2index | 140 |
1 files changed, 140 insertions, 0 deletions
#!/usr/bin/perl -- # -*- Perl -*-

# pdf2index - tidy the page-number lists of an index recovered from a PDF.
#
# Usage: pdf2index [file.pdf]
#
# Runs pstotext over the given file (with no argument pstotext is started
# without a file name), keeps the markup lines between the "<index" and
# "</index" lines of its output, and rewrites every run of consecutive
# <phrase role="pageno">...</phrase> elements so that the page numbers are
# sorted, free of duplicates, collapsed into ranges where three or more
# pages are consecutive, and separated by ", ".  The result is printed on
# standard output.
#
# The file is written as a modulino: the helper subs can be loaded with
# require() without running the program.

use strict;
use warnings;

my $PSTOTEXT = "pstotext";

exit main(@ARGV) unless caller;

# Program entry point.  Takes the command-line arguments, prints the
# rewritten index followed by a newline, and returns the process exit code.
# Dies when pstotext cannot be started.
sub main {
    my @args = @_;
    my $pdf  = shift @args;

    # List-form pipe open: the file name is handed to pstotext verbatim
    # instead of being re-parsed by a shell.
    my @command = ($PSTOTEXT);
    push @command, $pdf if defined $pdf;

    open(my $pipe, '-|', @command)
        or die "pdf2index: cannot run $PSTOTEXT: $!\n";
    my $index = extract_index($pipe);
    close($pipe)
        or warn "pdf2index: $PSTOTEXT did not finish cleanly"
              . ($! ? ": $!" : " (wait status $?)") . "\n";

    print collapse_index($index), "\n";
    return 0;
}

# Read lines from the filehandle $fh and return the concatenation of the
# index markup: every line that starts with optional whitespace and "<"
# while inside the index, plus the closing "</index" line itself.
sub extract_index {
    my ($fh) = @_;

    my $index   = "";
    my $inindex = 0;
    while (my $line = <$fh>) {
        if ($line =~ /^<\/index/) {
            $index .= $line;
            $inindex = 0;
        }
        $inindex = 1 if $line =~ /^<index/;

        $index .= $line if $inindex && $line =~ /^\s*</;
    }
    return $index;
}

# Return a copy of $index in which every run of adjacent pageno phrases has
# been expanded, sorted, de-duplicated, collapsed into ranges and joined
# with ", ".  Text outside those runs is copied through unchanged, except
# that whitespace directly after the last phrase of a run is dropped.
sub collapse_index {
    my ($index) = @_;

    my $phrase_re = qr/<phrase role="pageno">.*?<\/phrase>\s*/s;

    my $cindex = "";
    while ($index =~ /^(.*?)((?:$phrase_re)+)/s) {
        my ($before, $run) = ($1, $2);
        $index = substr($index, $+[0]);    # rest of the text, without $'
        $cindex .= $before;

        my @pages = $run =~ /$phrase_re/g;

        # Expand ranges such as "3-5" into one phrase per page.
        my @expanded;
        for my $page (@pages) {
            my $pageno = pageno($page);
            if ($pageno =~ /^([0-9]+)[^0-9]([0-9]+)$/) {    # funky -
                my ($first, $last) = ($1, $2);
                for (my $count = $first; $count <= $last; $count++) {
                    # The role is the literal "pageno"; the range text
                    # must not leak into the attribute.
                    push @expanded, "<phrase role=\"pageno\">$count</phrase>";
                }
            }
            else {
                push @expanded, $page;
            }
        }
        @pages = sort rangesort @expanded;

        # Remove duplicates (adjacent after sorting).
        my @unique;
        my $current = "";
        for my $page (@pages) {
            my $pageno = pageno($page);
            next if $pageno eq $current;
            push @unique, $page;
            $current = $pageno;
        }

        # Collapse runs of three or more consecutive pages into "first-last".
        my @collapsed;
        while (@unique) {
            my $len = rangelen(0, @unique);
            if ($len <= 2) {
                push @collapsed, shift @unique;
                next;
            }
            my @range = splice(@unique, 0, $len);
            my $fpage = $range[0];
            my $fpno  = pageno($fpage);
            my $lpno  = pageno($range[-1]);
            $fpage =~ s/>\Q$fpno\E</>${fpno}-$lpno</s;
            push @collapsed, $fpage;
        }

        # join() also copes with a run that expanded to nothing
        # (a reversed range such as "5-3").
        my @trimmed;
        for my $page (@collapsed) {
            (my $text = $page) =~ s/\s+\z//;
            push @trimmed, $text;
        }
        $cindex .= join(", ", @trimmed);
    }

    return $cindex . $index;
}

# Return the page label of a phrase element: the text that follows the
# leading <phrase ...> tag and an optional <link ...> tag, up to the next
# angle bracket.  Returns "?" when there is no such text.
sub pageno {
    my $page = shift;

    $page =~ s/^<phrase.*?>//;
    $page =~ s/^<link.*?>//;

    return $1 if $page =~ /^([^<>]+)/;
    return "?";
}

# sort() comparator for phrase elements (reads $a and $b).
sub rangesort {
    my $apno = pageno($a);
    my $bpno = pageno($b);

    # Make sure roman pages come before arabic ones, otherwise sort them in order
    return -1 if ($apno !~ /^\d+/ && $bpno =~ /^\d+/);
    return 1  if ($apno =~ /^\d+/ && $bpno !~ /^\d+/);

    # Two non-arabic labels both count as 0 here and compare equal.
    no warnings 'numeric';
    return $apno <=> $bpno;
}

# Return the length of the run of consecutive page numbers that starts at
# index $count of the phrase list passed after it (1 when the next page
# does not follow on directly).  Only purely numeric labels can form a
# run, so a roman page followed by pages 1, 2 is not taken for a range.
sub rangelen {
    my $count = shift;
    my @pages = @_;
    my $len   = 1;

    my $current = pageno($pages[$count]);
    while ($count < $#pages) {
        $count++;
        my $next = pageno($pages[$count]);
        last unless $current =~ /^[0-9]+$/ && ($current + 1) eq $next;
        $current = $next;
        $len++;
    }

    return $len;
}