summaryrefslogtreecommitdiff
path: root/fo/pdf2index
diff options
context:
space:
mode:
authorLorry <lorry@roadtrain.codethink.co.uk>2012-07-20 16:48:06 +0100
committerLorry <lorry@roadtrain.codethink.co.uk>2012-07-20 16:48:06 +0100
commit8e8edc70b402e69953fb9135a58d7faee12dc088 (patch)
treee8815e7ef185ccaba26bcd616eace378572b922e /fo/pdf2index
downloaddocbook-xsl-8e8edc70b402e69953fb9135a58d7faee12dc088.tar.gz
Tarball conversion
Diffstat (limited to 'fo/pdf2index')
-rwxr-xr-xfo/pdf2index140
1 files changed, 140 insertions, 0 deletions
diff --git a/fo/pdf2index b/fo/pdf2index
new file mode 100755
index 0000000..c14d8ec
--- /dev/null
+++ b/fo/pdf2index
@@ -0,0 +1,140 @@
+#!/usr/bin/perl -- # -*- Perl -*-
+
+# this needs some cleanup...
+
+my $PSTOTEXT = "pstotext";
+
+my $pdf = shift @ARGV;
+
+my $index = "";
+my $inindex = 0;
+open (F, "$PSTOTEXT $pdf |");
+while (<F>) {
+ if (/^<\/index/) {
+ $index .= $_;
+ $inindex = 0;
+ }
+ $inindex = 1 if /^<index/;
+
+ if ($inindex) {
+ $index .= $_ if /^\s*</;
+ }
+}
+
+my $cindex = "";
+while ($index =~ /^(.*?)((<phrase role=\"pageno\">.*?<\/phrase>\s*)+)/s) {
+ $cindex .= $1;
+ $_ = $2;
+ $index = $'; # '
+
+ my @pages = m/<phrase role=\"pageno\">.*?<\/phrase>\s*/sg;
+
+ # Expand ranges
+ if ($#pages >= 0) {
+ my @mpages = ();
+ foreach my $page (@pages) {
+ my $pageno = &pageno($page);
+ if ($pageno =~ /^([0-9]+)[^0-9]([0-9]+)$/) { # funky -
+ for (my $count = $1; $count <= $2; $count++) {
+ push (@mpages, "<phrase role=\"$pageno\">$count</phrase>");
+ }
+ } else {
+ push (@mpages, $page);
+ }
+ }
+ @pages = sort rangesort @mpages;
+ }
+
+ # Remove duplicates...
+ if ($#pages > 0) {
+ my @mpages = ();
+ my $current = "";
+ foreach my $page (@pages) {
+ my $pageno = &pageno($page);
+ if ($pageno ne $current) {
+ push (@mpages, $page);
+ $current = $pageno;
+ }
+ }
+ @pages = @mpages;
+ }
+
+ # Collapse ranges...
+ if ($#pages > 1) {
+ my @cpages = ();
+ while (@pages) {
+ my $count = 0;
+ my $len = &rangelen($count, @pages);
+ if ($len <= 2) {
+ my $page = shift @pages;
+ push (@cpages, $page);
+ } else {
+ my $fpage = shift @pages;
+ my $lpage = "";
+ while ($len > 1) {
+ $lpage = shift @pages;
+ $len--;
+ }
+ my $fpno = &pageno($fpage);
+ my $lpno = &pageno($lpage);
+ $fpage =~ s/>$fpno</>${fpno}-$lpno</s;
+ push (@cpages, $fpage);
+ }
+ }
+ @pages = @cpages;
+ }
+
+ my $page = shift @pages;
+ $page =~ s/\s*$//s;
+ $cindex .= $page;
+ while (@pages) {
+ $page = shift @pages;
+ $page =~ s/\s*$//s;
+ $cindex .= ", $page";
+ }
+}
+$cindex .= $index;
+
+print "$cindex\n";
+
+sub pageno {
+ my $page = shift;
+
+ $page =~ s/^<phrase.*?>//;
+ $page =~ s/^<link.*?>//;
+
+ return $1 if $page =~ /^([^<>]+)/;
+ return "?";
+}
+
+sub rangesort {
+ my $apno = &pageno($a);
+ my $bpno = &pageno($b);
+
+ # Make sure roman pages come before arabic ones, otherwise sort them in order
+ return -1 if ($apno !~ /^\d+/ && $bpno =~ /^\d+/);
+ return 1 if ($apno =~ /^\d+/ && $bpno !~ /^\d+/);
+ return $apno <=> $bpno;
+}
+
+sub rangelen {
+ my $count = shift;
+ my @pages = @_;
+ my $len = 1;
+ my $inrange = 1;
+
+ my $current = &pageno($pages[$count]);
+ while ($count < $#pages && $inrange) {
+ $count++;
+ my $next = &pageno($pages[$count]);
+ if ($current + 1 eq $next) {
+ $current = $next;
+ $inrange = 1;
+ $len++;
+ } else {
+ $inrange = 0;
+ }
+ }
+
+ return $len;
+}