1 files changed, 243 insertions, 0 deletions
diff --git a/gnu/xml/pipeline/LinkFilter.java b/gnu/xml/pipeline/LinkFilter.java
new file mode 100644
index 000000000..28a450170
--- /dev/null
+++ b/gnu/xml/pipeline/LinkFilter.java
@@ -0,0 +1,243 @@
+/* LinkFilter.java -- 
+   Copyright (C) 1999,2000,2001 Free Software Foundation, Inc.
+
+This file is part of GNU Classpath.
+
+GNU Classpath is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2, or (at your option)
+any later version.
+
+GNU Classpath is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GNU Classpath; see the file COPYING.  If not, write to the
+Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+02111-1307 USA.
+
+Linking this library statically or dynamically with other modules is
+making a combined work based on this library.  Thus, the terms and
+conditions of the GNU General Public License cover the whole
+combination.
+
+As a special exception, the copyright holders of this library give you
+permission to link this library with independent modules to produce an
+executable, regardless of the license terms of these independent
+modules, and to copy and distribute the resulting executable under
+terms of your choice, provided that you also meet, for each linked
+independent module, the terms and conditions of the license of that
+module.  An independent module is a module which is not derived from
+or based on this library.  If you modify this library, you may extend
+this exception to your version of the library, but you are not
+obligated to do so.  If you do not wish to do so, delete this
+exception statement from your version. */
+
+package gnu.xml.pipeline;
+
+import java.io.IOException;
+import java.net.URL; 
+import java.util.Enumeration;
+import java.util.Vector;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+
+
+/**
+ * Pipeline filter to remember XHTML links found in a document,
+ * so they can later be crawled.  Fragments are not counted, and duplicates
+ * are ignored.  Callers are responsible for filtering out URLs they aren't
+ * interested in.  Events are passed through unmodified.
+ *
+ * <p> Input MUST include a setDocumentLocator() call, as it's used to
+ * resolve relative links in the absence of a "base" element.  Input MUST
+ * also include namespace identifiers, since it is the XHTML namespace
+ * identifier which is used to identify the relevant elements.
+ *
+ * <p><em>FIXME:</em> handle xml:base attribute ... in association with
+ * a stack of base URIs.  Similarly, recognize/support XLink data.
+ *
+ * @author David Brownell
+ */
+public class LinkFilter extends EventFilter
+{
+    // resolved link URLs (as Strings), unique, in the order first seen
+    private Vector		vector = new Vector ();
+
+	// struct for "full" link record (tbd)
+	// these for troubleshooting original source:
+	//	original uri
+	//	uri as resolved (base, relative, etc)
+	//	URI of originating doc
+	//	line #
+	//	original element + attrs (img src, desc, etc)
+	//
+	// XLink model of the link ... for inter-site pairups ?
+
+    // resolved href of the latest XHTML "base" element; null if none seen
+    private String		baseURI;
+    // when true, off-site links are not recorded; nothing here sets it
+    private boolean		siteRestricted = false;
+    //
+    // XXX leverage blacklist info (like robots.txt)
+    //
+    // XXX constructor w/param ... pipeline for sending link data
+    // probably XHTML --> XLink, providing info as sketched above
+    //
+
+
+    /**
+     * Constructs a new event filter, which collects links in a private data
+     * structure for later enumeration; see {@link #getLinks}.
+     */
+	// constructor used by PipelineFactory
+    public LinkFilter ()
+    {
+	super.setContentHandler (this);	// this object is the content handler
+    }
+
+
+    /**
+     * Constructs a new event filter, which collects links in a private data
+     * structure for later enumeration and passes events through unmodified.
+     * @param next the consumer handed to the superclass constructor
+     */
+	// constructor used by PipelineFactory
+    public LinkFilter (EventConsumer next)
+    {
+	super (next);
+	super.setContentHandler (this);	// this object is the content handler
+    }
+
+
+    /**
+     * Returns an enumeration of the links found since the filter
+     * was constructed, or since {@link #removeAllLinks} was called.
+     * Each link appears once, resolved and without any fragment.
+     * @return enumeration of strings, in the order first encountered.
+     */
+    public Enumeration getLinks ()
+    {
+	return vector.elements ();
+    }
+
+    /**
+     * Forgets all links recorded so far, so that getLinks() starts
+     * out empty again.  Any recorded base URI is left untouched.
+     */
+    public void removeAllLinks ()
+    {
+	vector = new Vector ();	// fresh list; the old Vector is not cleared
+    }
+
+
+    /**
+     * Collects URIs from the XHTML elements which hold them, in their href,
+     * src, cite, or longdesc attributes; the event itself is passed on as is.
+     */
+    public void startElement (
+	String		uri,
+	String		localName,
+	String		qName,
+	Attributes	atts
+    ) throws SAXException
+    {
+	String	link;
+
+	// Recognize XHTML links; elements in other namespaces are ignored.
+	if ("http://www.w3.org/1999/xhtml".equals (uri)) {
+	    if ("a".equals (localName) || "base".equals (localName)
+		    || "area".equals (localName))
+		link = atts.getValue ("href");
+	    else if ("iframe".equals (localName) || "frame".equals (localName))
+		link = atts.getValue ("src");
+	    else if ("blockquote".equals (localName) || "q".equals (localName)
+		    || "ins".equals (localName) || "del".equals (localName))
+		link = atts.getValue ("cite");
+	    else
+		link = null;
+	    link = maybeAddLink (link);
+	    // "base" modifies designated baseURI
+	    if ("base".equals (localName) && link != null)
+		baseURI = link;
+
+	    // long descriptions are links too ("frame" has one as well)
+	    if ("iframe".equals (localName) || "frame".equals (localName)
+		    || "img".equals (localName))
+		maybeAddLink (atts.getValue ("longdesc"));
+	}
+	super.startElement (uri, localName, qName, atts);
+    }
+
+    /**
+     * Resolves a link against the current base URI, recording it unless
+     * it is a duplicate or (when site-restricted) off-site.  Returns the
+     * resolved URL text, or null for missing, empty, or malformed links.
+     */
+    private String maybeAddLink (String link)
+    {
+	// nothing to do without a link
+	if (link == null)
+	    return null;
+
+	// fragments inside docs don't count; what's left may be empty
+	int	hash = link.indexOf ('#');
+	String	trimmed = (hash < 0) ? link : link.substring (0, hash);
+	if (trimmed.length () == 0)
+	    return null;
+
+	URL	base, target;
+	try {
+	    // "base" element wins; else fall back on the document's own URI
+	    String	context = baseURI;
+	    if (context == null)
+		context = getDocumentLocator ().getSystemId ();
+	    base = new URL (context);
+	    target = new URL (base, trimmed);
+	} catch (IOException e) {
+	    // bad URLs we don't want
+	    return null;
+	}
+
+	String	resolved = target.toString ();
+	boolean	wanted = !vector.contains (resolved);
+	// other than what "base" does, stick to original site:
+	// don't switch protocols, and don't switch servers
+	if (wanted && siteRestricted) {
+	    String	host = base.getHost ();
+	    if (!base.getProtocol ().equals (target.getProtocol ()))
+		wanted = false;
+	    else if (host != null && !host.equals (target.getHost ()))
+		wanted = false;
+	}
+	if (wanted)
+	    vector.addElement (resolved);
+	return resolved;
+    }
+
+    /**
+     * Reports an error if no Locator is available; else passes the event on.
+     */
+    public void startDocument () throws SAXException
+    {
+	if (getDocumentLocator () == null)
+	    throw new SAXException ("no Locator!");
+	super.startDocument ();	// don't swallow the event
+    }
+
+    /**
+     * Forgets about any base URI information that may be recorded, then
+     * passes the event on.  Collected links are kept: applications will
+     * often call removeAllLinks() after examining what was reported.
+     */
+    public void endDocument ()
+    throws SAXException
+    {
+	baseURI = null;		// set only by a "base" element in startElement
+	super.endDocument ();
+    }
+}