summaryrefslogtreecommitdiff
path: root/external/jaxp/source/gnu/xml/pipeline/LinkFilter.java
diff options
context:
space:
mode:
Diffstat (limited to 'external/jaxp/source/gnu/xml/pipeline/LinkFilter.java')
-rw-r--r--external/jaxp/source/gnu/xml/pipeline/LinkFilter.java232
1 files changed, 0 insertions, 232 deletions
diff --git a/external/jaxp/source/gnu/xml/pipeline/LinkFilter.java b/external/jaxp/source/gnu/xml/pipeline/LinkFilter.java
deleted file mode 100644
index 811fc0300..000000000
--- a/external/jaxp/source/gnu/xml/pipeline/LinkFilter.java
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * Copyright (C) 1999-2001 David Brownell
- *
- * This file is part of GNU JAXP, a library.
- *
- * GNU JAXP is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * GNU JAXP is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * As a special exception, if you link this library with other files to
- * produce an executable, this library does not by itself cause the
- * resulting executable to be covered by the GNU General Public License.
- * This exception does not however invalidate any other reasons why the
- * executable file might be covered by the GNU General Public License.
- */
-
-package gnu.xml.pipeline;
-
-import java.io.IOException;
-import java.net.URL;
-import java.util.Enumeration;
-import java.util.Vector;
-
-import org.xml.sax.Attributes;
-import org.xml.sax.Locator;
-import org.xml.sax.SAXException;
-
-
-/**
- * Pipeline filter to remember XHTML links found in a document,
- * so they can later be crawled. Fragments are not counted, and duplicates
- * are ignored. Callers are responsible for filtering out URLs they aren't
- * interested in. Events are passed through unmodified.
- *
- * <p> Input MUST include a setDocumentLocator() call, as it's used to
- * resolve relative links in the absence of a "base" element. Input MUST
- * also include namespace identifiers, since it is the XHTML namespace
- * identifier which is used to identify the relevant elements.
- *
- * <p><em>FIXME:</em> handle xml:base attribute ... in association with
- * a stack of base URIs. Similarly, recognize/support XLink data.
- *
- * @author David Brownell
- */
-public class LinkFilter extends EventFilter
-{
- // for storing URIs
- private Vector vector = new Vector ();
-
- // struct for "full" link record (tbd)
- // these for troubleshooting original source:
- // original uri
- // uri as resolved (base, relative, etc)
- // URI of originating doc
- // line #
- // original element + attrs (img src, desc, etc)
-
- // XLink model of the link ... for inter-site pairups ?
-
- private String baseURI;
-
- private boolean siteRestricted = false;
-
- //
- // XXX leverage blacklist info (like robots.txt)
- //
- // XXX constructor w/param ... pipeline for sending link data
- // probably XHTML --> XLink, providing info as sketched above
- //
-
-
- /**
- * Constructs a new event filter, which collects links in private data
- * structure for later enumeration.
- */
- // constructor used by PipelineFactory
- public LinkFilter ()
- {
- super.setContentHandler (this);
- }
-
-
- /**
- * Constructs a new event filter, which collects links in private data
- * structure for later enumeration and passes all events, unmodified,
- * to the next consumer.
- */
- // constructor used by PipelineFactory
- public LinkFilter (EventConsumer next)
- {
- super (next);
- super.setContentHandler (this);
- }
-
-
- /**
- * Returns an enumeration of the links found since the filter
- * was constructed, or since removeAllLinks() was called.
- *
- * @return enumeration of strings.
- */
- public Enumeration getLinks ()
- {
- return vector.elements ();
- }
-
- /**
- * Removes records about all links reported to the event
- * stream, as if the filter were newly created.
- */
- public void removeAllLinks ()
- {
- vector = new Vector ();
- }
-
-
- /**
- * Collects URIs for (X)HTML content from elements which hold them.
- */
- public void startElement (
- String uri,
- String localName,
- String qName,
- Attributes atts
- ) throws SAXException
- {
- String link;
-
- // Recognize XHTML links.
- if ("http://www.w3.org/1999/xhtml".equals (uri)) {
-
- if ("a".equals (localName) || "base".equals (localName)
- || "area".equals (localName))
- link = atts.getValue ("href");
- else if ("iframe".equals (localName) || "frame".equals (localName))
- link = atts.getValue ("src");
- else if ("blockquote".equals (localName) || "q".equals (localName)
- || "ins".equals (localName) || "del".equals (localName))
- link = atts.getValue ("cite");
- else
- link = null;
- link = maybeAddLink (link);
-
- // "base" modifies designated baseURI
- if ("base".equals (localName) && link != null)
- baseURI = link;
-
- if ("iframe".equals (localName) || "img".equals (localName))
- maybeAddLink (atts.getValue ("longdesc"));
- }
-
- super.startElement (uri, localName, qName, atts);
- }
-
- private String maybeAddLink (String link)
- {
- int index;
-
- // ignore empty links and fragments inside docs
- if (link == null)
- return null;
- if ((index = link.indexOf ("#")) >= 0)
- link = link.substring (0, index);
- if (link.equals (""))
- return null;
-
- try {
- // get the real URI
- URL base = new URL ((baseURI != null)
- ? baseURI
- : getDocumentLocator ().getSystemId ());
- URL url = new URL (base, link);
-
- link = url.toString ();
-
- // ignore duplicates
- if (vector.contains (link))
- return link;
-
- // other than what "base" does, stick to original site:
- if (siteRestricted) {
- // don't switch protocols
- if (!base.getProtocol ().equals (url.getProtocol ()))
- return link;
- // don't switch servers
- if (base.getHost () != null
- && !base.getHost ().equals (url.getHost ()))
- return link;
- }
-
- vector.addElement (link);
-
- return link;
-
- } catch (IOException e) {
- // bad URLs we don't want
- }
- return null;
- }
-
- /**
- * Reports an error if no Locator has been made available.
- */
- public void startDocument ()
- throws SAXException
- {
- if (getDocumentLocator () == null)
- throw new SAXException ("no Locator!");
- }
-
- /**
- * Forgets about any base URI information that may be recorded.
- * Applications will often want to call removeAllLinks(), likely
- * after examining the links which were reported.
- */
- public void endDocument ()
- throws SAXException
- {
- baseURI = null;
- super.endDocument ();
- }
-}