docgen: Add html link-check script

author: Lukas Larsson <lukas@erlang.org> 2020-03-26 12:48:50 +0100
committer: Lukas Larsson <lukas@erlang.org> 2020-04-17 10:28:27 +0200
commit: d9c9864b83b47e4695e95692531b7fe98f04e213 (patch)
tree: 31d466fce830ef6af3af302e2efb9d4af31c11e5 /scripts
parent: 8e83be4fad802c2c39017cc606d3c34651ca39e1 (diff)
download: erlang-d9c9864b83b47e4695e95692531b7fe98f04e213.tar.gz
1 files changed, 533 insertions, 0 deletions
diff --git a/scripts/otp_html_check b/scripts/otp_html_check
new file mode 100755
index 0000000000..abe6245ad3
--- /dev/null
+++ b/scripts/otp_html_check
@@ -0,0 +1,533 @@
+#!/usr/bin/perl -w
+
+###########################################################################
+#
+#  Find broken links and files not referenced.
+#
+#  Author: Kent Boortz <kent@erix.ericsson.se>
+#
+###########################################################################
+
+use File::Find;
+use strict;
+
+undef $/;			# No record separator reading files
+
+###########################################################################
+#
+#  When we talk about "a page" we mean the actual page/file
+#  When we talk about "a link" we mean a referense to a page/file.
+#  All links/URL's start with an slash except the top link that is
+#  the empty string.
+#
+#  So basically we have a set of links and a set of URL's to pages and
+#  check if this is a valid combination.
+#
+###########################################################################
+
+my $debug = 1;
+my $expand_url = 0;		# If we are to expand an URL with default
+				# names like "index.html"
+my @indexes =			# The order to try URL expansion
+  (
+   "index.shtml",
+   "index.html",
+   "index.htm",
+  );
+
+my $html_ext = 'shtml|html|htm'; # HTML pages ends in these
+
+my @links;			# Set of [page,link] we want to check
+my @exclude;			# Pages/dir/prefix to exclude
+my %pages;			# Set of all files found in the file system
+				# limited by the script arguments.
+				# After the spider is done all members in the
+				# set thas has the value 1 was visited.
+
+my %missing;			# Pages not found "$page$;$link"
+my %invalid;			# After expansion it is invalid
+my %access;			# Can't access but exists
+
+my %anchor_refs;		# Absolute links including anchor part
+my %anchor_defs;		# <a name="..."> in the form "$page#$anchor"
+
+###########################################################################
+#
+#  Argument processing, see usage() function below
+#
+###########################################################################
+
+@ARGV or usage("No base directory given");
+my $base = shift @ARGV;
+-d $base or usage("Not a directory: $base");
+$base =~ m&^/& or usage("Has to be absolute path: $base");
+$base =~ s&/+$&&;		# Remove ending slash if any
+
+my $link;
+while ($link = shift @ARGV) {
+    last if $link eq '--';
+    $link =~ s&/+$&&;		# Remove ending slash if any
+    $link =~ s&$base&&;		# Make absolute URL
+    $link =~ m&^/& and usage("Invalid start point of HTML tree \"$_\"");
+    $link = "/$link";
+    push(@links,["",$link]);
+}
+
+while ($link = shift @ARGV) {
+    $link =~ s&/+$&&;		# Remove ending slash if any
+    $link =~ s&$base&&;		# Make absolute URL
+    $link =~ m&^/& and usage("Invalid exclude URL \"$_\"");
+    $link = "/$link";
+    push(@exclude,$link);
+}
+
+# OTP specific
+
+push(@links,["","/doc/index.html"]) unless @links;
+
+###########################################################################
+#
+#  Traverse all files and directories and put all possible URL's into
+#  the set %pages. When we later find a referense to a page that URL
+#  is removed from the set. When we have followed all links the set
+#  contains the pages never visited.
+#
+#  We skip files and directories in @exclude.
+#
+###########################################################################
+
+find(\&wanted,$base);
+
+sub wanted {
+  return unless -f;
+  return if /^\.info\./;
+  return if /~$/;
+
+  my $url = $File::Find::name;
+  $url =~ s&$base&&;
+  $pages{$url} = 0 unless map {$url =~ m&^$_&} @exclude;
+}
+
+
+###########################################################################
+#
+#  Spider that follow all links adding links to the @links set.
+#
+#  @links is expanded, normalized links
+#
+#  We check if there is an valid URL for this link.
+#  @links may contain links that look bad, this is cleaned up here
+#  before checking it.
+#
+###########################################################################
+
+while (@links) {
+    my $page_and_link = shift @links;
+    my ($page,$link) = @$page_and_link;
+
+    # We skip some links directly
+
+    next if $link =~ /^\w{3,10}:/i;
+    next if $link =~ /cgi-bin|cgiwrap|user-cgi/;
+    next if $link =~ /^and|or$/;
+#    next if $link eq "";
+
+#    print STDERR "1 link: $link\n";
+
+    $link = expand_link($link,\%pages) if $expand_url;
+
+    unless (exists $pages{$link}) {
+	# No page for link, mark as invalid
+	$missing{"$page$;$link"} = 1;
+	next;
+    }
+
+#    print STDERR "2 link: $link\n";
+
+    next if $pages{$link};	# If == 1 it is visited
+    $pages{$link} = 1;		# Mark as visited
+
+#    print STDERR "3 link: $link\n";
+
+#    next unless $link =~ /\.(shtml|html|htm)$/oi;
+    next unless $link =~ /\.($html_ext)$/oi;
+
+    push(@links,get_page_links($base,$link));
+}
+
+
+###########################################################################
+#
+#  Read the page and get all the links. We know that the URL for the page
+#  is absolute and that a page/file exists.
+#
+###########################################################################
+
+sub get_page_links {
+  my $base = shift;
+  my $page = shift;		# Absolute URL
+
+#  print STDERR "open: $page\n";
+
+  my $path = "$base$page";
+
+  open(HTML,$path)
+    or print STDERR "INTERNAL ERROR: Can't open page $page: $!\n";
+
+  my $html = <HTML>;
+  close HTML;
+
+#  my $url_base = $page;
+#  $url_base =~ s&/[^/]+$&&;
+
+  # Remove comments
+  $html =~ s/\<\!\-\-\s*(.*?)\s*\-\-\>//gs;
+
+#  # Remove comments and expand SSI
+#  $html =~ s/\<\!\-\-\s*(.*?)\s*\-\-\>/
+#    expand_ssi($url_base,$page,$1)/gsie;
+
+  my @links;			# Links in this document
+#  push(@links,$html =~ /\/\*URL\*\/\s*\'([^\']+\.[^\']+)\'/gsi);
+#  push(@links,$html =~ /=\s*\'([^\']+\.(?:gif|jpg|jpeg))\'/gsi);
+#  push(@links,$html =~ /option value=\s*\"(\/[^\"]+)\"/gsi);
+#  push(@links,$html =~ /option value=\s*\"([^\"]+\.[^\"]+)\"/gsi);
+# FIXME: This is not working....
+#  push(@links,$html =~ /url\s*=\s*([\w-\.\/]+)/gsi);
+#  push(@links,$html =~ /\"([^\"]+\.html)\"/gsi);
+
+  # Find real HTML links
+  push(@links,$html =~ /\<\s*\w[^\>]*\sHREF=\s*\"([^\"]*)\"[^\>]*\>/gsi);
+  push(@links,$html =~ /\<\s*\w[^\>]*\sSRC=\s*\"([^\"]*)\"[^\>]*\>/gsi);
+  push(@links,$html =~ /\<\s*\w[^\>]*\sLOWSRC=\s*\"([^\"]*)\"[^\>]*\>/gsi);
+  push(@links,$html =~ /\<\s*\w[^\>]*\sBACKGROUND=\s*\"([^\"]*)\"[^\>]*\>/gsi);
+
+  # FIXME: Now we have the raw links, if we want to complain about
+  # spaces etc this is the time.
+
+  # Remove referenses to the same page  FIXME??? Was removed , why...
+#  @links = grep {$_ and $_ !~ /^\#/} @links;
+
+  # Find the URL to the current directory
+  my $rpath = $page;
+  $rpath =~ s&/[^/]+$&&;	# Remove name
+
+  # Links pointing to the same page
+  # should look the same
+  map {$_ = normalize_link($page,$rpath,$_)} @links;
+
+#  print "XXX $page\n" if grep {m&lib/asn1-1.3.2/doc/index\.html&} @links;
+
+  map {$_ = [$page,$_]} @links;	# Add what page was referensing it
+
+  # Find the anchors
+  
+  my @anchors =
+    ($html =~ m/
+     <
+     \s*
+     A
+     [^>]*
+     \s (?: NAME|ID) \s* = \s*
+     (?: \"([^\"]*)\" | \'([^\']*)\' | ([^>\s]+) )
+     [^>]*
+     >
+    /gsix);
+
+  foreach my $anchor (@anchors) {
+      # FIXME if already there, duplicate
+      next unless defined $anchor;
+      $anchor =~ s/%([\da-fA-F]{2})/chr(hex($1))/eg; # Translate hex to char
+      $anchor =~ s/&lt;/</g; # 
+      $anchor =~ s/&gt;/>/g; #
+      $anchor_defs{"$page#$anchor"} = 1;
+  }
+
+  return @links;
+}
+
+
+# -------------------------------------------------------------------------
+# -------------------------------------------------------------------------
+
+sub normalize_link {
+    my $page  = shift;		# Page where we found this link
+    my $rpath = shift;		# URL to directory where we found this link
+    my $link  = shift;		# The link to normalize
+
+#    print STDERR "\n";
+#    print STDERR "1 normalize_link: $link\n";
+
+    # Handle javascript:erlhref() specially to be able to check those links.
+    if ($link =~ /^javascript:erlhref\(([^\)]*)\);$/) {
+	my($up,$part,$mod) = split(/,\s*/, $1);
+	$up  =~ tr/\'//d;
+	$part =~ tr/\'//d;
+	$mod =~ tr/\'//d;
+	my $dir;
+	if ($part =~ m&^[a-z]+/&) {
+	    $dir = "$base$rpath/${up}/$part";
+	} else {
+	    my $path = "$base$rpath/${up}lib/$part";
+	    ($dir) = <$path-*>;
+	    return $link unless defined $dir;
+	    $dir .= "/doc/html";
+	}
+	$dir =~ s&^$base&&o;
+	$link = "$dir/$mod";
+    }
+
+    return $link if $link =~ /^\w{3,10}:/i; # mailto: http: .....
+    return $link if $link =~ /\?/i;         # Contains arguments to CGI
+
+    $link =~ s/%([\da-fA-F]{2})/chr(hex($1))/eg; # Translate hex to char
+
+    if ($link eq "") {
+	# The empty link is a reference to URL directory
+	return $rpath;
+    } elsif ($link =~ /^#(.*)$/) {
+	# Lokal reference to anchor
+        my $anchor = $1;
+        $anchor =~ s/%([\da-fA-F]{2})/chr(hex($1))/eg; # Translate hex to char
+        $anchor =~ s/&lt;/</g; # 
+        $anchor =~ s/&gt;/>/g; #
+	push(@{$anchor_refs{"$page#$anchor"}}, $page);
+	return $page;
+    }
+
+    my $anchor = "";
+
+    if ($link =~ s&#(.*)$&&) {
+	# Removed page ref (anchor)
+	$anchor = $1;
+        $anchor =~ s/%([\da-fA-F]{2})/chr(hex($1))/eg; # Translate hex to char
+        $anchor =~ s/&lt;/</g; # 
+        $anchor =~ s/&gt;/>/g; #
+    }
+
+    $link = "" if $link eq "/";
+
+    # Make the link absolute
+    # FIXME: maybe move down.....
+
+    if ($link !~ m&^/&) {
+	if ($link) {
+	  $link = "$rpath/$link";
+	} else {
+	  $link = $rpath;
+	}
+    }
+
+    my $xlink = $link;
+
+    $link =~ s&//+&/&g;		# Replace multiple slashes with one slash
+#    $link =~ s&^(\./)+&&g;	# Remove starting dot slash "./"  (can't be if absolute)
+    $link =~ s&(/\.)+$&&;	# Remove ending slash dot "/."
+    $link =~ s&(/\.)+/&/&g;	# Remove all slash dot slash "/./"
+    $link =~ s&/+$&&;		# Remove ending slashes
+
+    # Remove a real directory part followed by ".."
+
+    while ($link =~ s&/[^/]+/\.\.&&) {}
+
+#    print STDERR "4 normalize_link: $link\n";
+
+    $link = "" if $link eq "/";	# We do this again
+
+    #    print STDERR "5 normalize_link: $link\n";
+
+    push(@{$anchor_refs{"$link#$anchor"}}, $page) if $anchor;
+
+    return $link;
+}
+
+
+# -------------------------------------------------------------------------
+# We know the link is normalized
+# -------------------------------------------------------------------------
+
+sub expand_link {
+    my $link = shift;
+    my $pages = shift;
+
+    return $link if exists $pages{$link};
+
+    my $newlink;
+
+    foreach my $index (@indexes) {
+	$newlink = "$link/$index";
+	return $newlink if exists $pages{$newlink};
+    }
+
+    return $link;
+}
+
+###########################################################################
+#
+#  Report the result
+#
+###########################################################################
+
+if (keys %missing) {
+    print "\n\n\n**** Broken links\n\n";
+    foreach (sort keys %missing) {
+	my ($page,$link) = split($;);
+	print qq(Broken Link: $page -> "$link"\n);
+    }
+}
+
+
+# Entrys in %pages that has the value 0 is not visited
+if (keys %pages) {
+    print "\n\n\n**** Files not used (that I can see)\n\n";
+    foreach my $page (sort keys %pages) {
+	next if $pages{$page};	# If == 1 it is visited
+
+	# OTP specific
+
+	next if $page =~ m&^/(man|pdf|logs|COPYRIGHT|PR.template|README)&;
+	next if $page =~ m&^/.*\.tar.gz$&;
+	next if $page =~ m&(/info|\.kwc)$&;
+
+	print qq("$page"\n);
+    }
+}
+
+
+# Remove all references that has a matching NAME=....
+map {delete $anchor_refs{$_}} keys %anchor_defs;
+
+if (keys %anchor_refs) {
+    print "\n\n\n**** References to missing anchors\n\n";
+    foreach my $ref (sort keys %anchor_refs) {
+        foreach my $anchor (sort @{$anchor_refs{$ref}}) {
+            print qq(Missing Anchor: "$ref" from ${anchor}\n);
+        }
+    }
+}
+
+
+###########################################################################
+
+sub usage {
+  print STDERR "ERROR: ",join("\n",@_),"\n" if @_;
+  print <<HERE;
+Usage: $0 BaseDirectory URL [ URLs... ] [ -- ExcludeURLs... ]
+
+This script try to find out what files are used and not of your
+HTML documents, graphic files etc. It doesn't use HTTP, i.e. you
+work off-line, so this script may fail to find a link. Javascripts
+and other extensions also makes it very hard. But for many sites
+it work very well.
+
+The base directory has to given has to start with a slash.
+
+For URLs and ExcludeURLs absolute paths or relative the base
+directory can be used.
+
+ExcludeURLs is used as prefixes of directories or files that
+should be excluded from the search.
+
+You call it something like
+
+  % $0 /test/r7a /test/r7a/doc/index.html /test/r7a/lib/*/doc/index.html
+
+or using relative start points
+
+  % $0 /test/r7a doc/index.html
+
+HERE
+  exit 1;
+}
+
+
+__END__
+
+# FIXME: The order below is important
+
+if (%access) {
+  print "\n**** Link exists but can't open\n\n";
+
+  my $file;
+
+  foreach $file (sort keys %access) {
+    print "$file\n";
+  }
+}
+
+
+if (%invalid) {
+  print "\n**** Invalid links (goes up above top directory)\n\n";
+
+  foreach (sort keys %invalid) {
+    my ($page,$link) = split($;,$_);
+    delete $done{$link};	# FIXME: xxxx
+    print "$page\n\t-> $link\n";
+  }
+}
+
+if (%done) {
+  print "\n**** Internal error, should be no files here\n\n";
+
+  foreach (sort keys %done) {
+    print "$_\n";
+  }
+}
+
+
+__END__
+###########################################################################
+
+
+sub expand_ssi {
+  my $url_base = shift;
+  my $page = shift;
+  my $comment = shift;		# Text between <!-- and -->
+
+  return "" unless $comment =~ s/^\#//;
+
+  # This is an SSI
+  unless ($comment =~ /([\w-]+)=\"([^\"]+)\"/) {
+#    print STDERR "WARNING: Unknown SSI $comment\n\ton $page\n";
+    return "";
+  }
+
+  my $op = lc($1);		# Operator
+  my $inc = $2;			# Absolute or relative URL anding in anything
+
+  if ($debug) {
+    print STDERR "X: url_base = $url_base\n";
+    print STDERR "X: page     = $page\n";
+    print STDERR "X: op       = $op\n";
+    print STDERR "X: inc      = $inc\n";
+    print STDERR "X: base     = $base\n";
+  }
+
+  unless ($op eq 'virtual') {
+#    print STDERR "WARNING: Unknown SSI $comment\n\ton $page\n";
+    return "";
+  }
+
+  $inc = make_url_absolute($url_base,$page,$inc);
+
+  my $path = "$base$inc";
+
+  if ($debug) {
+    print STDERR "X: inc      = $inc\n";
+    print STDERR "X: path     = $path\n\n";
+  }
+
+  unless (open(HTML,$path)) {
+#    print STDERR "ERROR: Can't open page $inc: $!\n";
+    $access{$inc} = 1;
+    return "";
+  }
+
+  my $html = <HTML>;
+  close HTML;
+
+  $done{$inc} = 1;		# Mark done
+
+  return $html;
+}
+
author	Lukas Larsson <lukas@erlang.org>	2020-03-26 12:48:50 +0100
committer	Lukas Larsson <lukas@erlang.org>	2020-04-17 10:28:27 +0200
commit	d9c9864b83b47e4695e95692531b7fe98f04e213 (patch)
tree	31d466fce830ef6af3af302e2efb9d4af31c11e5 /scripts
parent	8e83be4fad802c2c39017cc606d3c34651ca39e1 (diff)
download	erlang-d9c9864b83b47e4695e95692531b7fe98f04e213.tar.gz