summaryrefslogtreecommitdiff
path: root/ACE/examples/Web_Crawler
diff options
context:
space:
mode:
authorWilliam R. Otte <wotte@dre.vanderbilt.edu>2006-07-24 15:50:30 +0000
committerWilliam R. Otte <wotte@dre.vanderbilt.edu>2006-07-24 15:50:30 +0000
commitc44379cc7d9c7aa113989237ab0f56db12aa5219 (patch)
tree66a84b20d47f2269d8bdc6e0323f338763424d3a /ACE/examples/Web_Crawler
parent3aff90f4a822fcf5d902bbfbcc9fa931d6191a8c (diff)
downloadATCD-c44379cc7d9c7aa113989237ab0f56db12aa5219.tar.gz
Repo restructuring
Diffstat (limited to 'ACE/examples/Web_Crawler')
-rw-r--r--ACE/examples/Web_Crawler/.cvsignore1
-rw-r--r--ACE/examples/Web_Crawler/Command_Processor.cpp128
-rw-r--r--ACE/examples/Web_Crawler/Command_Processor.h98
-rw-r--r--ACE/examples/Web_Crawler/HTTP_URL.cpp87
-rw-r--r--ACE/examples/Web_Crawler/HTTP_URL.h64
-rw-r--r--ACE/examples/Web_Crawler/Iterators.cpp163
-rw-r--r--ACE/examples/Web_Crawler/Iterators.h117
-rw-r--r--ACE/examples/Web_Crawler/Makefile.am60
-rw-r--r--ACE/examples/Web_Crawler/Mem_Map_Stream.cpp240
-rw-r--r--ACE/examples/Web_Crawler/Mem_Map_Stream.h190
-rw-r--r--ACE/examples/Web_Crawler/Options.cpp177
-rw-r--r--ACE/examples/Web_Crawler/Options.h124
-rw-r--r--ACE/examples/Web_Crawler/README25
-rw-r--r--ACE/examples/Web_Crawler/URL.cpp39
-rw-r--r--ACE/examples/Web_Crawler/URL.h82
-rw-r--r--ACE/examples/Web_Crawler/URL_Addr.cpp234
-rw-r--r--ACE/examples/Web_Crawler/URL_Addr.h111
-rw-r--r--ACE/examples/Web_Crawler/URL_Status.cpp40
-rw-r--r--ACE/examples/Web_Crawler/URL_Status.h61
-rw-r--r--ACE/examples/Web_Crawler/URL_Visitor.cpp543
-rw-r--r--ACE/examples/Web_Crawler/URL_Visitor.h436
-rw-r--r--ACE/examples/Web_Crawler/URL_Visitor_Factory.cpp53
-rw-r--r--ACE/examples/Web_Crawler/URL_Visitor_Factory.h74
-rw-r--r--ACE/examples/Web_Crawler/Web_Crawler.cpp95
-rw-r--r--ACE/examples/Web_Crawler/Web_Crawler.h62
-rw-r--r--ACE/examples/Web_Crawler/Web_Crawler.mpc7
-rw-r--r--ACE/examples/Web_Crawler/main.cpp51
27 files changed, 3362 insertions, 0 deletions
diff --git a/ACE/examples/Web_Crawler/.cvsignore b/ACE/examples/Web_Crawler/.cvsignore
new file mode 100644
index 00000000000..ba2906d0666
--- /dev/null
+++ b/ACE/examples/Web_Crawler/.cvsignore
@@ -0,0 +1 @@
+main
diff --git a/ACE/examples/Web_Crawler/Command_Processor.cpp b/ACE/examples/Web_Crawler/Command_Processor.cpp
new file mode 100644
index 00000000000..83289095444
--- /dev/null
+++ b/ACE/examples/Web_Crawler/Command_Processor.cpp
@@ -0,0 +1,128 @@
+// $Id$
+
+#include "ace/OS_NS_string.h"
+#include "URL.h"
+#include "HTTP_URL.h"
+#include "Options.h"
+#include "Command_Processor.h"
+#include "URL_Visitor.h"
+
+ACE_RCSID(Web_Crawler, Command_Processor, "$Id$")
+
+// Out-of-line definition of the base-class destructor; it has nothing
+// to release.
+Command::~Command (void)
+{
+}
+
+// Store the <url> pointer that <execute> later operates on.  Only the
+// pointer is kept; the URL itself is not copied here.
+URL_Command::URL_Command (URL *url)
+ : url_ (url)
+{
+}
+
+// Visit the URL held by this command unless its path contains one of
+// the substrings we never crawl.  Returns 0 when the URL was skipped
+// or visited successfully, -1 when the visitor rejected it.
+int
+URL_Command::execute (void)
+{
+  // Substrings identifying content that is never visited (news and
+  // mail links, CGI scripts, images, PDFs, image maps).
+  static const char *const skipped_patterns[] =
+    {
+      "news:", ".cgi", "mailto", ".gif", ".pdf", ".map", ".bmp", ".jpg"
+    };
+  const size_t pattern_count =
+    sizeof skipped_patterns / sizeof skipped_patterns[0];
+
+  const ACE_CString path
+    (ACE_TEXT_ALWAYS_CHAR (this->url_->url_addr ().get_path_name ()));
+
+  // A match anywhere in the path is enough to skip the URL quietly.
+  for (size_t i = 0; i < pattern_count; ++i)
+    if (path.find (skipped_patterns[i]) != ACE_CString::npos)
+      return 0;
+
+  // Hand the URL to the configured visitor.
+  if (this->url_->accept (OPTIONS::instance ()->visitor ()) == 0)
+    return 0;
+
+  ACE_DEBUG ((LM_DEBUG,
+              "Coudnt accept url\n"));
+  return -1;
+}
+
+// Delete this command object.  Always returns 0; the object must not
+// be used after this call.
+int
+URL_Command::destroy (void)
+{
+ delete this;
+ return 0;
+}
+// Default constructor; performs no work of its own.
+Command_Processor::Command_Processor (void)
+{
+}
+
+// Destructor; performs no explicit cleanup.  It is declared private in
+// the header, so instances are released through <destroy>.
+Command_Processor::~Command_Processor (void)
+{
+}
+
+// Delete this <Command_Processor>.  Always returns 0; the object must
+// not be touched after the call.
+int
+Command_Processor::destroy (void)
+{
+  delete this;
+  return 0;
+}
+
+// Drain the queue, executing each dequeued command in turn.  Returns 0
+// once the queue is empty, or -1 as soon as a dequeue or a command
+// fails; in that case the commands still queued are left untouched.
+int
+Command_Processor::execute (void)
+{
+  while (this->url_queue_.is_empty () != 1)
+    {
+      Command *command = 0;
+      if (this->url_queue_.dequeue_head (command) != 0)
+        ACE_ERROR_RETURN ((LM_ERROR,
+                           "%p\n", "dequeue_head"),
+                          -1);
+
+      URL_Command *url_command = dynamic_cast<URL_Command *> (command);
+
+      // The cast yields 0 for a null entry or for a command that is
+      // not a <URL_Command>; dereferencing it below would crash.
+      if (url_command == 0)
+        {
+          // The entry has already left the queue, so release it here
+          // rather than leak it.
+          if (command != 0)
+            command->destroy ();
+          ACE_ERROR_RETURN ((LM_ERROR,
+                             "Dequeued command is not a URL_Command\n"),
+                            -1);
+        }
+
+      // NOTE(review): <Auto_Destroyer> is declared elsewhere; presumably
+      // it calls <destroy> on the command when it leaves scope.
+      Auto_Destroyer<URL_Command> url_command_ptr (url_command);
+      if (url_command_ptr->execute () != 0)
+        ACE_ERROR_RETURN ((LM_ERROR,
+                           "%p\n", "Couldnt execute command"),
+                          -1);
+    }
+  return 0;
+}
+
+// Queue <command> for later execution.  The configured ordering decides
+// where it goes: "FIFO" appends it to the tail, "LIFO" pushes it onto
+// the head.  Returns 0 once the command has been queued and -1 when it
+// has not (queue full, enqueue failure, or unrecognised ordering), so
+// the caller can tell that the command was not accepted.
+int
+Command_Processor::insert (Command *command)
+{
+  if (this->url_queue_.is_full () == 1)
+    ACE_ERROR_RETURN ((LM_ERROR,
+                       ACE_TEXT ("command queue is full\n")),
+                      -1);
+
+  if (ACE_OS::strcmp (OPTIONS::instance ()->order (), ACE_TEXT ("FIFO")) == 0)
+    {
+      if (this->url_queue_.enqueue_tail (command) != 0)
+        ACE_ERROR_RETURN ((LM_ERROR,
+                           ACE_TEXT ("%p\n"), ACE_TEXT ("enqueue_tail")),
+                          -1);
+    }
+  else if (ACE_OS::strcmp (OPTIONS::instance ()->order (), ACE_TEXT ("LIFO")) == 0)
+    {
+      if (this->url_queue_.enqueue_head (command) != 0)
+        ACE_ERROR_RETURN ((LM_ERROR,
+                           ACE_TEXT ("%p\n"), ACE_TEXT ("enqueue_head")),
+                          -1);
+    }
+  else
+    // Neither ordering matched: previously the command was dropped
+    // silently while success was still reported.
+    ACE_ERROR_RETURN ((LM_ERROR,
+                       ACE_TEXT ("unknown ordering %s, expected FIFO or LIFO\n"),
+                       OPTIONS::instance ()->order ()),
+                      -1);
+
+  return 0;
+}
+
+#if defined (ACE_HAS_EXPLICIT_STATIC_TEMPLATE_MEMBER_INSTANTIATION)
+template ACE_Singleton<Options, ACE_Null_Mutex> *ACE_Singleton<Options, ACE_Null_Mutex>::singleton_;
+#endif /* ACE_HAS_EXPLICIT_STATIC_TEMPLATE_MEMBER_INSTANTIATION */
diff --git a/ACE/examples/Web_Crawler/Command_Processor.h b/ACE/examples/Web_Crawler/Command_Processor.h
new file mode 100644
index 00000000000..742a316804c
--- /dev/null
+++ b/ACE/examples/Web_Crawler/Command_Processor.h
@@ -0,0 +1,98 @@
+/* -*- C++ -*- */
+// $Id$
+
+// ============================================================================
+//
+// = LIBRARY
+// examples/Web_Crawler
+//
+// = FILENAME
+// Command_Processor.h
+//
+// = AUTHOR
+// Douglas C. Schmidt <schmidt@cs.wustl.edu>
+//
+// ============================================================================
+
+#ifndef _COMMAND_PROCESSOR_H
+#define _COMMAND_PROCESSOR_H
+
+#if !defined (ACE_LACKS_PRAGMA_ONCE)
+#pragma once
+#endif /* ACE_LACKS_PRAGMA_ONCE */
+
+#include "ace/Containers.h"
+#include "Options.h"
+
+// Forward decl.
+class URL;
+
+class Command
+{
+ // = TITLE
+ // Abstract base class for a command.
+ //
+ // = DESCRIPTION
+ // Each command is executed by a <Command_Processor>.
+public:
+ virtual ~Command (void);
+ // Virtual destructor.
+
+ virtual int execute (void) = 0;
+ // This is the entry point to execute the command.
+ virtual int destroy (void) = 0;
+ // Release the command object; each concrete command supplies the
+ // implementation.
+};
+
+class URL_Command : public Command
+{
+ // = TITLE
+ // Defines an API for executing a command on a URL.
+ //
+ // = DESCRIPTION
+ // Each command is executed by a <Command_Processor>.
+public:
+ URL_Command (URL *);
+ // Constructor.  Keeps the given <URL> pointer for <execute>.
+
+ virtual int execute (void);
+ // Execute the URL command.
+
+ int destroy (void);
+ // Commit suicide, i.e., delete this object.
+private:
+ URL *url_;
+ // Pointer to the URL.
+};
+
+class Command_Processor
+{
+ // = TITLE
+ // Execute commands that are passed to it.
+ //
+ // = DESCRIPTION
+ // This class implements the Command Processor pattern.
+public:
+ Command_Processor (void);
+ // Constructor.
+
+ int insert (Command *);
+ // Insert a new <Command> into the <Command_Processor>'s queue.
+
+ int execute (void);
+ // Execute all the <Commands> in the queue.
+
+ int destroy (void);
+ // Destroy the <Command_Processor>.
+
+private:
+ friend class ACE_Shutup_GPlusPlus;
+ // Turn off g++ warning
+
+ ~Command_Processor (void);
+ // Ensure dynamic allocation.
+
+ // @@ You fill in here...
+ ACE_Unbounded_Queue<Command *> url_queue_;
+ // Queue of pending commands, filled by <insert> and drained by
+ // <execute>.
+};
+
+
+#endif /* _COMMAND_PROCESSOR_H */
diff --git a/ACE/examples/Web_Crawler/HTTP_URL.cpp b/ACE/examples/Web_Crawler/HTTP_URL.cpp
new file mode 100644
index 00000000000..44ceea324d4
--- /dev/null
+++ b/ACE/examples/Web_Crawler/HTTP_URL.cpp
@@ -0,0 +1,87 @@
+// $Id$
+
+#include "ace/OS_NS_stdio.h"
+#include "ace/OS_NS_string.h"
+#include "ace/Auto_Ptr.h"
+#include "URL_Visitor.h"
+#include "Options.h"
+#include "HTTP_URL.h"
+
+ACE_RCSID(Web_Crawler, HTTP_URL, "$Id$")
+
+// Accessor: return a reference to the address this URL was built from.
+const ACE_URL_Addr &
+HTTP_URL::url_addr (void) const
+{
+ return this->url_addr_;
+}
+
+// Copy <url_addr> and remember the containing page.  When <cp> is 0
+// the URL is recorded as its own containing page.  The address is
+// logged at debug level.
+HTTP_URL::HTTP_URL (const ACE_URL_Addr &url_addr,
+ HTTP_URL *cp)
+ : url_addr_ (url_addr),
+ containing_page_ (cp == 0 ? this : cp)
+{
+ ACE_DEBUG ((LM_DEBUG, "HTTP_URL %s\n", url_addr.addr_to_string ()));
+}
+
+// Send an HTTP/1.1 GET request for this URL's path, followed by the
+// Host header, over the URL's stream.  Returns the byte count reported
+// for the Host-header send, or -1 if allocation or either send fails.
+ssize_t
+HTTP_URL::send_request (void)
+{
+  // One scratch buffer is reused for both request lines, so it is
+  // sized for path + host plus generous slack for the fixed text.
+  const size_t buffer_size =
+    ACE_OS::strlen (this->url_addr ().get_path_name ())
+    + ACE_OS::strlen (this->url_addr ().get_host_name ())
+    + 20 // Extra
+    + 1 // NUL byte
+    + 16; // Protocol filler...
+
+  char *raw_buffer;
+  ACE_NEW_RETURN (raw_buffer,
+                  char[buffer_size],
+                  -1);
+
+  // Releases the buffer on every exit path.
+  ACE_Auto_Basic_Array_Ptr<char> request (raw_buffer);
+
+  // Request line.
+  ACE_OS::sprintf (request.get (),
+                   "GET /%s HTTP/1.1\r\n",
+                   ACE_TEXT_ALWAYS_CHAR (this->url_addr ().get_path_name ()));
+
+  if (this->stream ().send_n (request.get (),
+                              ACE_OS::strlen (request.get ()),
+                              const_cast<ACE_Time_Value *>
+                              (OPTIONS::instance ()->timeout ())) <= 0)
+    return -1;
+
+  // Host header plus the blank line that terminates the request.
+  ACE_OS::sprintf (request.get (),
+                   "Host: %s\r\n\r\n",
+                   this->url_addr ().get_host_name ());
+
+  // The explicit length matters: only the formatted text is sent, not
+  // the whole buffer.
+  const ssize_t sent =
+    this->stream ().send_n (request.get (),
+                            ACE_OS::strlen (request.get ()),
+                            const_cast<ACE_Time_Value *>
+                            (OPTIONS::instance ()->timeout ()));
+
+  // Mark the handler idle whether or not the send succeeded.
+  this->stream ().svc_handler ()->idle (0);
+
+  if (sent <= 0)
+    return -1;
+  return sent;
+}
+
+// Visitor-pattern hook: forward this URL to <visitor> and return
+// whatever its <visit> method returns.
+int
+HTTP_URL::accept (URL_Visitor *visitor)
+{
+ // This is part of the visitor pattern.
+ return visitor->visit (*this);
+}
+
+// Delete this URL object.  Always returns 0; the object must not be
+// used after this call.
+int
+HTTP_URL::destroy (void)
+{
+ delete this;
+ return 0;
+ // Commit suicide!
+}
diff --git a/ACE/examples/Web_Crawler/HTTP_URL.h b/ACE/examples/Web_Crawler/HTTP_URL.h
new file mode 100644
index 00000000000..a926bb47938
--- /dev/null
+++ b/ACE/examples/Web_Crawler/HTTP_URL.h
@@ -0,0 +1,64 @@
+/* -*- C++ -*- */
+// $Id$
+
+// ============================================================================
+//
+// = LIBRARY
+// examples/Web_Crawler
+//
+// = FILENAME
+// HTTP_URL.h
+//
+// = AUTHOR
+// Douglas C. Schmidt <schmidt@cs.wustl.edu>
+//
+// ============================================================================
+
+#ifndef _HTTP_URL_H
+#define _HTTP_URL_H
+
+#include "URL_Status.h"
+#include "URL.h"
+#include "Options.h"
+
+#if !defined (ACE_LACKS_PRAGMA_ONCE)
+#pragma once
+#endif /* ACE_LACKS_PRAGMA_ONCE */
+
+class HTTP_URL : public URL
+{
+ // = TITLE
+ // An ADT for an HTTP URL.
+ //
+ // = DESCRIPTION
+ // This class plays the "element" role in the Visitor pattern.
+public:
+ HTTP_URL (const ACE_URL_Addr &url_addr,
+ HTTP_URL *containing_page = 0);
+ // The <url_addr> is the URL that we're going to be visiting. We
+ // also keep track of the containing page, if any, which is used to
+ // print out more meaningful messages.  If <containing_page> is 0
+ // the URL acts as its own containing page.
+
+ virtual int accept (URL_Visitor *visitor);
+ // Accept the visitor, which will then perform a particular
+ // visitation strategy on the URL. This method is part of the
+ // Visitor pattern.
+
+ virtual ssize_t send_request (void);
+ // Send a <GET> command to fetch the contents in the URI from the
+ // server.  Returns -1 on failure, otherwise the number of bytes
+ // reported for the final send.
+
+ virtual const ACE_URL_Addr &url_addr (void) const;
+ // Returns the URL that we represent.
+
+ int destroy (void);
+ // Commit suicide, i.e., delete this object.
+private:
+ ACE_URL_Addr url_addr_;
+ // Address of the URL we're connected to.
+
+ HTTP_URL *containing_page_;
+ // Page that contained us.
+};
+
+#endif /* _HTTP_URL_H */
diff --git a/ACE/examples/Web_Crawler/Iterators.cpp b/ACE/examples/Web_Crawler/Iterators.cpp
new file mode 100644
index 00000000000..98b4f999622
--- /dev/null
+++ b/ACE/examples/Web_Crawler/Iterators.cpp
@@ -0,0 +1,163 @@
+// $Id$
+
+#include "Options.h"
+#include "Iterators.h"
+
+ACE_RCSID(Web_Crawler, Iterators, "$Id$")
+
+// Out-of-line definition of the base-class destructor; nothing to
+// release.
+URL_Iterator::~URL_Iterator (void)
+{
+}
+
+// Delete this iterator.  Always returns 0; the object must not be
+// used after this call.
+int
+URL_Iterator::destroy (void)
+{
+ // Commit suicide.
+ delete this;
+ return 0;
+}
+
+// Bind the iterator to <url>.  Only a reference is kept, so <url> must
+// remain valid for as long as the iterator is used.
+HTML_Body_Iterator::HTML_Body_Iterator (URL &url)
+ : url_ (url)
+{
+}
+
+// Scan the stream, one <BUFSIZ> chunk at a time, for the next "HREF"
+// (or "href") and pass the text between the following pair of double
+// quotes back in <url>.  Returns the length of that text when a link
+// was found, and 0 when the stream is exhausted or no properly quoted
+// link could be extracted.  Note that an empty quoted link therefore
+// also yields 0.
+int
+HTML_Body_Iterator::next (ACE_CString &url)
+{
+  size_t len = BUFSIZ;
+  const char *buf;
+  ACE_CString buffer;
+  int href_index = 0;
+
+  // <recv> yields a null pointer once no more data is available.  Test
+  // for that explicitly: an ordered comparison between a pointer and
+  // the literal 0 ("buf > 0") is rejected by current C++ compilers.
+  for (buf = this->url_.stream ().recv (len);
+       buf != 0;
+       buf = this->url_.stream ().recv (len))
+    {
+      // Work on a private copy of the chunk.
+      buffer.set (buf, BUFSIZ, 1);
+
+      href_index = buffer.find ("HREF");
+
+      if (href_index < 0)
+        href_index = buffer.find ("href");
+
+      // Grep for " and grab the string until end-"
+      if (href_index > 0)
+        {
+          // Get back to buffer start location.
+          this->url_.stream ().seek (-1 * static_cast<off_t> (len),
+                                     SEEK_CUR);
+
+          int start_index = buffer.find ('\"',
+                                         href_index);
+          if (start_index <= 0)
+            break;
+
+          start_index += href_index;
+
+          int end_index = buffer.find ('\"',
+                                       start_index + 1);
+          if (end_index <= 0)
+            break;
+
+          end_index += start_index + 1;
+
+          ssize_t url_len = end_index - (start_index + 1);
+
+          ACE_CString temp = buffer.substring (start_index + 1,
+                                               url_len);
+          url.set (temp.c_str (), len, 1);
+
+          // Continue the next scan just past the closing quote.
+          this->url_.stream ().seek (end_index + 1);
+
+          return url_len;
+        }
+    }
+  return 0;
+}
+
+// Bind the iterator to <url> and mark the header as not yet finished.
+// Only a reference to <url> is kept.
+HTTP_Header_Iterator::HTTP_Header_Iterator (URL &url)
+ : url_ (url),
+ end_of_header_ (0)
+{
+}
+
+// Pass back the next header line in <line>.  Returns 1 when a line was
+// produced, and 0 once the end of the header has already been reached
+// or the stream ran out before a line terminator was found.
+int
+HTTP_Header_Iterator::next (ACE_CString &line)
+{
+  if (this->end_of_header_)
+    return 0;
+  else
+    {
+      // Keep the character in an <int> and compare against <EOF>
+      // itself.  Narrowing to <char> first makes a data byte of 0xFF
+      // indistinguishable from end-of-stream.
+      for (int c;
+           (c = this->url_.stream ().get_char ()) != EOF;
+           )
+        {
+          // Check to see if we're at the end of the header line.
+          if (c == '\r' && this->url_.stream ().peek_char (0) == '\n')
+            {
+              line.set (this->url_.stream ().recv (),
+                        this->url_.stream ().recv_len () - 1,
+                        1);
+
+              // Check to see if we're at the end of the header.
+              if (this->url_.stream ().peek_char (1) == '\r'
+                  && this->url_.stream ().peek_char (2) == '\n')
+                {
+                  this->end_of_header_ = 1;
+                  // We're at the end of the header section.
+                  this->url_.stream ().seek (3);
+                }
+              else
+                // We're at the end of the line.
+                this->url_.stream ().seek (1);
+
+              return 1;
+            }
+          // Handle broken Web servers that use '\n' instead of
+          // '\r\n'.
+          else if (c == '\n')
+            {
+              line.set (this->url_.stream ().recv (),
+                        (this->url_.stream ().recv_len ()),
+                        1);
+
+              // Check to see if we're at the end of the header.
+              if (this->url_.stream ().peek_char (0) == '\n')
+                {
+                  // We're at the end of the header section.
+                  this->url_.stream ().seek (1);
+                  this->end_of_header_ = 1;
+                }
+
+              return 1;
+            }
+        }
+
+    }
+  return 0;
+}
+
+// Bind the iterator to <url>.  Only a reference is kept.
+URL_Download_Iterator::URL_Download_Iterator (URL &url)
+ : url_ (url)
+{
+}
+
+// Copy the next chunk of at most <BUFSIZ> bytes from the stream into
+// <buffer>.  Returns 1 when a chunk was delivered and 0 when the
+// stream had no more data to hand out.
+int
+URL_Download_Iterator::next (ACE_CString &buffer)
+{
+  size_t chunk_len = BUFSIZ;
+  const char *chunk = this->url_.stream ().recv (chunk_len);
+
+  if (chunk != 0)
+    {
+      // <recv> updated <chunk_len> to the size of the chunk.
+      buffer.set (chunk, chunk_len, 1);
+      return 1;
+    }
+
+  return 0;
+}
diff --git a/ACE/examples/Web_Crawler/Iterators.h b/ACE/examples/Web_Crawler/Iterators.h
new file mode 100644
index 00000000000..b5d267f7afb
--- /dev/null
+++ b/ACE/examples/Web_Crawler/Iterators.h
@@ -0,0 +1,117 @@
+/* -*- C++ -*- */
+// $Id$
+
+// ============================================================================
+//
+// = LIBRARY
+// examples/Web_Crawler
+//
+// = FILENAME
+// Iterators.h
+//
+// = AUTHOR
+// Douglas C. Schmidt <schmidt@cs.wustl.edu>
+//
+// ============================================================================
+
+#ifndef _ITERATORS_H
+#define _ITERATORS_H
+
+#include "URL.h"
+
+#if !defined (ACE_LACKS_PRAGMA_ONCE)
+#pragma once
+#endif /* ACE_LACKS_PRAGMA_ONCE */
+
+class URL_Iterator
+{
+ // = TITLE
+ // An abstract base class that defines an iterator.
+ //
+ // = DESCRIPTION
+ // Subclasses of this base class can define what strings
+ // to return from <next>. This class decouples higher-level
+ // software from the details of whatever type of URL header or
+ // body we're iterating over.
+public:
+ // = Initialization and termination methods.
+ virtual int destroy (void);
+ // "virtual" destructor: deletes the iterator and returns 0.  The
+ // real destructor is protected, so this is how iterators are
+ // released.
+
+ // = Iterator methods.
+ virtual int next (ACE_CString &string) = 0;
+ // Pass back the next <string> that hasn't been seen yet. Returns 0
+ // when all items have been seen, else 1.
+
+protected:
+ virtual ~URL_Iterator (void);
+ // C++ destructor.
+};
+
+class HTML_Body_Iterator : public URL_Iterator
+{
+ // = TITLE
+ // An iterator that returns URLs embedded in HTML files.
+public:
+ // = Initialization and termination methods.
+ HTML_Body_Iterator (URL &url);
+ // Constructor.
+
+ // = Iterator methods.
+ virtual int next (ACE_CString &url);
+ // Pass back the next <url> that hasn't been seen in the
+ // memory-mapped file. Returns 0 when all items have been seen,
+ // else the length of the URL text that was found.
+
+private:
+ URL &url_;
+ // HTTP URL that we're iterating over.
+};
+
+class HTTP_Header_Iterator : public URL_Iterator
+{
+ // = TITLE
+ // An iterator that iterates over the HTTP header.
+public:
+ // = Initialization and termination methods.
+ HTTP_Header_Iterator (URL &url);
+ // Constructor.
+
+ // = Iterator methods.
+ virtual int next (ACE_CString &line);
+ // Pass back the next <line> that hasn't been seen in the
+ // memory-mapped file header. Returns 0 once the end of the header
+ // has been reached, else 1.
+
+private:
+ URL &url_;
+ // HTTP URL that we're iterating over.
+
+ int end_of_header_;
+ // We've found the end of the header, which means this iterator is
+ // finished.
+};
+
+class URL_Download_Iterator : public URL_Iterator
+{
+ // = TITLE
+ // An iterator that iterates over the contents of an entire URL,
+ // i.e., both header and body, and returns it in <BUFSIZ>
+ // <buffer>s.
+public:
+ // = Initialization and termination methods.
+ URL_Download_Iterator (URL &url);
+ // Constructor.
+
+ // = Iterator methods.
+ virtual int next (ACE_CString &buffer);
+ // Pass back the next <buffer> data from the stream, where
+ // <buffer.size> <= <BUFSIZ> . Returns 0 when no more data is
+ // available from the stream, else 1.
+
+private:
+ URL &url_;
+ // HTTP URL that we're iterating over.
+};
+
+#endif /* _ITERATORS_H */
diff --git a/ACE/examples/Web_Crawler/Makefile.am b/ACE/examples/Web_Crawler/Makefile.am
new file mode 100644
index 00000000000..00a30a4d2e1
--- /dev/null
+++ b/ACE/examples/Web_Crawler/Makefile.am
@@ -0,0 +1,60 @@
+## Process this file with automake to create Makefile.in
+##
+## $Id$
+##
+## This file was generated by MPC. Any changes made directly to
+## this file will be lost the next time it is generated.
+##
+## MPC Command:
+## /acebuilds/ACE_wrappers-repository/bin/mwc.pl -include /acebuilds/MPC/config -include /acebuilds/MPC/templates -feature_file /acebuilds/ACE_wrappers-repository/local.features -noreldefs -type automake -exclude build,Kokyu
+
+ACE_BUILDDIR = $(top_builddir)
+ACE_ROOT = $(top_srcdir)
+
+
+## Makefile.Web_Crawler.am
+
+if !BUILD_ACE_FOR_TAO
+noinst_PROGRAMS = main
+
+main_CPPFLAGS = \
+ -I$(ACE_ROOT) \
+ -I$(ACE_BUILDDIR)
+
+main_SOURCES = \
+ Command_Processor.cpp \
+ HTTP_URL.cpp \
+ Iterators.cpp \
+ Mem_Map_Stream.cpp \
+ Options.cpp \
+ URL.cpp \
+ URL_Addr.cpp \
+ URL_Status.cpp \
+ URL_Visitor.cpp \
+ URL_Visitor_Factory.cpp \
+ Web_Crawler.cpp \
+ main.cpp \
+ Command_Processor.h \
+ HTTP_URL.h \
+ Iterators.h \
+ Mem_Map_Stream.h \
+ Options.h \
+ URL.h \
+ URL_Addr.h \
+ URL_Status.h \
+ URL_Visitor.h \
+ URL_Visitor_Factory.h \
+ Web_Crawler.h
+
+main_LDADD = \
+ $(ACE_BUILDDIR)/ace/libACE.la
+
+endif !BUILD_ACE_FOR_TAO
+
+## Clean up template repositories, etc.
+clean-local:
+ -rm -f *~ *.bak *.rpo *.sym lib*.*_pure_* core core.*
+ -rm -f gcctemp.c gcctemp so_locations *.ics
+ -rm -rf cxx_repository ptrepository ti_files
+ -rm -rf templateregistry ir.out
+ -rm -rf ptrepository SunWS_cache Templates.DB
diff --git a/ACE/examples/Web_Crawler/Mem_Map_Stream.cpp b/ACE/examples/Web_Crawler/Mem_Map_Stream.cpp
new file mode 100644
index 00000000000..dda1d465a71
--- /dev/null
+++ b/ACE/examples/Web_Crawler/Mem_Map_Stream.cpp
@@ -0,0 +1,240 @@
+// $Id$
+
+#include "ace/FILE_Addr.h"
+#include "ace/Auto_Ptr.h"
+#include "Options.h"
+#include "Mem_Map_Stream.h"
+
+ACE_RCSID(Web_Crawler, Mem_Map_Stream, "$Id$")
+
+// Return the socket stream owned by the current service handler.
+// <svc_handler_> is set by <open>, so <open> must have succeeded first.
+ACE_SOCK_Stream &
+Mem_Map_Stream::stream (void)
+{
+ return svc_handler_->peer ();
+}
+
+// Forward <size> bytes of <buf> to the peer socket's <send_n>, with
+// flags of 0 and the optional timeout <tv>; returns its result.
+ssize_t
+Mem_Map_Stream::send_n (const void *buf, size_t size, ACE_Time_Value *tv)
+{
+ return svc_handler_->peer ().send_n (buf, size, 0, tv);
+}
+
+// Returns non-zero when the <get> position has reached (or passed) the
+// end of the data mapped so far.  More data may still arrive from the
+// socket; callers pair this with <grow_file_and_remap>.
+int
+Mem_Map_Stream::eof (void) const
+{
+ return this->get_pos_ >= this->end_of_mapping_plus1_;
+}
+
+// Return the character at the <get> position and advance that
+// position by one.  When the mapped data is exhausted, try to fetch
+// more from the socket first; returns EOF if that fails.
+int
+Mem_Map_Stream::get_char (void)
+{
+  if (this->eof () && this->grow_file_and_remap () == -1)
+    return EOF;
+
+  // Convert through <unsigned char>, as the C stdio <fgetc> does, so
+  // that a data byte of 0xFF is not sign-extended into EOF on
+  // platforms where <char> is signed.
+  return static_cast<unsigned char> (*this->get_pos_++);
+}
+
+// Reset the <recv> and <get> positions to the first mapped byte and
+// recompute the end-of-mapping marker from the current mapping size.
+// Always returns 0.
+int
+Mem_Map_Stream::rewind (void)
+{
+  char *const mapping_start =
+    reinterpret_cast<char *> (this->mem_map_.addr ());
+
+  this->recv_pos_ = mapping_start;
+  this->get_pos_ = mapping_start;
+  this->end_of_mapping_plus1_ = mapping_start + this->mem_map_.size ();
+  return 0;
+}
+
+// Return the character <offset> positions past the <get> position
+// without moving it.  More data is fetched from the socket as needed;
+// returns EOF if the stream ends before that position.
+int
+Mem_Map_Stream::peek_char (size_t offset)
+{
+  // We may need to iterate if <offset> is large.
+  while (this->get_pos_ + offset >= this->end_of_mapping_plus1_)
+    if (this->grow_file_and_remap () == -1)
+      return EOF;
+
+  // Convert through <unsigned char> so that a data byte of 0xFF is not
+  // sign-extended into EOF on platforms where <char> is signed.
+  return static_cast<unsigned char> (this->get_pos_[offset]);
+}
+
+// Accessor: return the current <recv> position without moving it.
+const char *
+Mem_Map_Stream::recv (void) const
+{
+ return this->recv_pos_;
+}
+
+// Return a pointer to the data at the <recv> position and advance the
+// stream by <len> bytes via <seek>.  On return <len> holds the
+// distance between the new <get> position and the returned pointer.
+// Returns 0 when the stream is at its end and no more data can be
+// fetched.
+const char *
+Mem_Map_Stream::recv (size_t &len)
+{
+ if (this->eof () && this->grow_file_and_remap () == -1)
+ return 0;
+
+
+ // Remember the start before <seek> moves both positions.
+ const char *s = this->recv_pos_;
+ off_t olen = static_cast <off_t> (len);
+ // NOTE(review): the result of <seek> is ignored, so a failed seek is
+ // not reported to the caller here.
+ this->seek (olen, SEEK_CUR);
+ len = this->get_pos_ - s;
+ return s;
+}
+
+// Return the number of bytes between the <recv> position and the <get>
+// position, i.e., the length of the token scanned so far.
+size_t
+Mem_Map_Stream::recv_len (void) const
+{
+ return this->get_pos_ - this->recv_pos_;
+}
+
+// Return a pointer to the data <offset> bytes past the <get> position
+// once at least <size> bytes are mapped from there, fetching more data
+// from the socket as needed.  Neither position is moved.  Returns 0 if
+// the stream ends first.
+const char *
+Mem_Map_Stream::peek_str (size_t offset,
+ size_t size)
+{
+ // We will iterate if the size of <offset> is large.
+ while (this->get_pos_ + (offset + size) > this->end_of_mapping_plus1_)
+ if (this->grow_file_and_remap () == -1)
+ return 0;
+
+ return &this->get_pos_[offset];
+}
+
+// Move the <get> position according to <whence> and <offset>, fetch
+// more data from the socket until the mapping covers it, then set the
+// <recv> position to the same place.  Returns the new position as an
+// offset from the start of the mapping, or -1 if the stream ends
+// before the target or <whence> is <SEEK_END>, which is not supported.
+off_t
+Mem_Map_Stream::seek (off_t offset, int whence)
+{
+  switch (whence)
+    {
+    case SEEK_SET:
+      this->get_pos_ =
+        reinterpret_cast<char *> (this->mem_map_.addr ())
+        + offset;
+      break;
+
+    case SEEK_CUR:
+      this->get_pos_ += offset;
+      break;
+
+    case SEEK_END:
+      // @@ Not sure how to implement this (yet).  Report the failure
+      // without touching <get_pos_>, so that a rejected seek leaves
+      // the stream positions exactly as they were.
+      ACE_NOTSUP_RETURN (-1);
+    }
+
+  // Make sure that the backing store will cover this.
+  while (this->get_pos_ > this->end_of_mapping_plus1_)
+    if (this->grow_file_and_remap () == -1)
+      return (off_t) -1;
+
+  this->recv_pos_ = this->get_pos_;
+  return this->recv_pos_ - reinterpret_cast<char *> (this->mem_map_.addr ());
+}
+
+// Accessor: return the service handler obtained by <open>.
+Mem_Map_Stream::Svc_Handler *
+Mem_Map_Stream::svc_handler (void)
+{
+ return this->svc_handler_;
+}
+
+
+// Connect to <addr> through <connector>, create the temporary backing
+// file for the memory mapping, and reset the stream positions.
+// Returns 0 on success and -1 if the connect or the file open fails.
+int
+Mem_Map_Stream::open (STRAT_CONNECTOR *connector,
+ const ACE_INET_Addr &addr)
+{
+ // Start from a null handler so that <connect> supplies one.
+ svc_handler_ = 0;
+
+ // Connect to the server at <addr>. If the handler has to be
+ // connected to the server again, the Caching strategy takes care
+ // and uses the same connection.
+ if (connector->connect (svc_handler_,
+ addr) == -1)
+ {
+
+ ACE_ERROR_RETURN ((LM_ERROR,
+ "%p %s %d\n",
+ "Connect failed",
+ addr.get_host_name (),
+ addr.get_port_number ()),
+ -1);
+ }
+ // Create a temporary filename.
+ ACE_FILE_Addr file (ACE_sap_any_cast (ACE_FILE_Addr &));
+
+ // Create the temporary file via the <ACE_Mem_Map> class API.
+ if (this->mem_map_.open (file.get_path_name (),
+ O_RDWR | O_CREAT | O_APPEND,
+ ACE_DEFAULT_FILE_PERMS) == -1)
+ ACE_ERROR_RETURN ((LM_ERROR,
+ "%p\n",
+ "open"),
+ -1);
+ // Make sure to unlink this right away so that if this process
+ // crashes these files will be removed automatically.
+ // NOTE: the unlink below is currently compiled out (#if 0), so the
+ // file is only removed by the destructor's call to <remove>.
+#if 0
+ else if (ACE_OS::unlink (file.get_path_name ()) == -1)
+ ACE_ERROR_RETURN ((LM_ERROR,
+ "%p\n",
+ "unlink"),
+ -1);
+ else
+#endif
+ // Initialize all the position pointers to 0.
+ this->rewind ();
+
+ return 0;
+}
+
+// Read another chunk from the socket, append it to the temporary file,
+// and remap the file so that the mapping covers the new data.  Returns
+// 0 on success and -1 on a receive error, on end of input, or if the
+// write or the remap fails.
+int
+Mem_Map_Stream::grow_file_and_remap (void)
+{
+ char buf[BUFSIZ + 1];
+
+ // Copy the next chunk of bytes from the socket into the temporary
+ // file.
+ ACE_Time_Value tv (*OPTIONS::instance ()->timeout ());
+
+ // NOTE(review): <recv_n> is asked for the whole buffer; confirm how
+ // it reports a final chunk shorter than <sizeof buf> when the peer
+ // closes or the timeout expires, since that data may be dropped.
+ ssize_t n = this->svc_handler_->peer ().recv_n (buf,
+ sizeof buf,
+ 0,
+ &tv);
+ if (n == -1)
+ {
+ // Receive errors are only logged when debugging is enabled.
+ if (OPTIONS::instance ()->debug ())
+ ACE_ERROR ((LM_ERROR,
+ "%p\n",
+ "recv"));
+ return -1;
+ }
+ else if (n == 0)
+ return -1;
+ else if (ACE::write_n (this->mem_map_.handle (), buf, n) != n)
+ ACE_ERROR_RETURN ((LM_ERROR,
+ "%p\n",
+ "write_n"),
+ -1);
+
+ // Grow the memory-mapping to encompass the entire temporary file.
+ if (this->mem_map_.map (-1,
+ PROT_RDWR,
+ ACE_MAP_PRIVATE | ACE_MAP_FIXED,
+ ACE_DEFAULT_BASE_ADDR) == -1)
+ ACE_ERROR_RETURN ((LM_ERROR,
+ "%p\n",
+ "map"),
+ -1);
+ // MAP_FAILED is used as a "first time in" flag.
+ if (this->recv_pos_ == MAP_FAILED)
+ {
+ this->recv_pos_ = reinterpret_cast<char *> (this->mem_map_.addr ());
+ this->get_pos_ = this->recv_pos_;
+ }
+
+ // The end marker always tracks the current size of the mapping.
+ this->end_of_mapping_plus1_ =
+ reinterpret_cast<char *> (this->mem_map_.addr ())
+ + this->mem_map_.size ();
+
+ return 0;
+}
+
+// Destructor: releases the backing store through <ACE_Mem_Map::remove>.
+Mem_Map_Stream::~Mem_Map_Stream (void)
+{
+ // Remove the mapping and the file.
+ this->mem_map_.remove ();
+}
+
diff --git a/ACE/examples/Web_Crawler/Mem_Map_Stream.h b/ACE/examples/Web_Crawler/Mem_Map_Stream.h
new file mode 100644
index 00000000000..3595f04ab77
--- /dev/null
+++ b/ACE/examples/Web_Crawler/Mem_Map_Stream.h
@@ -0,0 +1,190 @@
+/* -*- C++ -*- */
+// $Id$
+
+// ============================================================================
+//
+// = LIBRARY
+// examples/Web_Crawler
+//
+// = FILENAME
+// Mem_Map_Stream.h
+//
+// = AUTHOR
+// Douglas C. Schmidt <schmidt@cs.wustl.edu>
+//
+// ============================================================================
+
+#ifndef _MEM_MAP_STREAM_H
+#define _MEM_MAP_STREAM_H
+#include /**/ "ace/pre.h"
+
+#include "ace/SOCK_Stream.h"
+
+#if !defined (ACE_LACKS_PRAGMA_ONCE)
+#pragma once
+#endif /* ACE_LACKS_PRAGMA_ONCE */
+
+
+#include "ace/Mem_Map.h"
+#include "ace/SOCK_Connector.h"
+#include "ace/Connector.h"
+#include "ace/Svc_Handler.h"
+#include "ace/Strategies_T.h"
+
+class Mem_Map_Stream
+{
+ // = TITLE
+ // Provides a memory-mapped stream abstraction to simplify parsing
+ // of tokens.
+ //
+ // = DESCRIPTION
+ // This class makes it possible to treat a connection as a stream
+ // of bytes, similar to the C library stdio streams. The contents
+ // of the connection are buffered incrementally in a memory-mapped
+ // file. This class maintains pointers to two positions in the
+ // stream:
+ //
+ // 1. The <recv> position, which keeps track of the beginning of a
+ // token that is in the stream.
+ //
+ // 2. The <get> position, which moves along character-by-character
+ // until the end of the token is reached.
+ //
+ // Once a token has been located, it can be extracted from the
+ // stream by calling the <recv>. The length of the token, i.e.,
+ // the <recv_len>, is the length in bytes between the <get>
+ // position and the <recv> position. Once the token has been
+ // extracted, the <recv> and <get> positions can be updated by the
+ // <seek> method.
+
+public:
+ typedef ACE_Svc_Handler<ACE_SOCK_STREAM, ACE_NULL_SYNCH> Svc_Handler;
+
+ typedef ACE_Strategy_Connector<Svc_Handler,
+ ACE_SOCK_CONNECTOR>
+ STRAT_CONNECTOR;
+
+ // Mem_Map_Stream (void);
+ // constructor added:KIRTHIKA
+ virtual int open (STRAT_CONNECTOR *connector,
+ const ACE_INET_Addr &);
+ // Initialize this object.
+
+ virtual ~Mem_Map_Stream (void);
+ // Destructor.
+
+ // = Accessor.
+ ACE_SOCK_Stream &stream (void);
+ // Returns the underlying <ACE_SOCK_Stream>.
+
+ // = I/O methods.
+
+ virtual ssize_t send_n (const void *buf,
+ size_t size,
+ ACE_Time_Value *tv = 0);
+ // Send <size> bytes in <buf> to the connected peer. This is a
+ // completely unbuffered call.
+
+ virtual int get_char (void);
+ // Return the next character in the stream and advance the <get>
+ // position. Returns EOF when the <get> position reaches the end of
+ // the HTTP stream.
+
+ virtual const char *recv (size_t &len);
+ // Returns a pointer to array of at most <len> characters starting
+ // at the <recv> position. If the <recv> position + <len> extends
+ // past the EOF then <len> is set to the number of characters
+ // between the <recv> position and the EOF and both the <get> and
+ // <recv> positions are advanced by <len>. Returns 0 if the <recv>
+ // position is at the EOF.
+
+ virtual const char *recv (void) const;
+ // Returns a pointer to array of characters starting at the <recv>
+ // position.
+
+ virtual size_t recv_len (void) const;
+ // Returns the length in bytes between the <get> position and the
+ // <recv> position.
+
+ virtual int rewind (void);
+ // Resets the <get> and <recv> positions to the beginning of the
+ // stream. This works since all the data has been cached in the
+ // memory-mapped backing store.
+
+ virtual int peek_char (size_t offset);
+ // Returns the nth character <offset> from the <get> position in the
+ // stream without advancing the <get> position. Automatically
+ // extends the backing store if necessary. Returns EOF if <offset>
+ // is past the end of the stream.
+
+ virtual const char *peek_str (size_t offset, size_t size);
+ // Return a pointer to an array of <size> characters starting at
+ // <offset> characters from the <get> position in the stream without
+ // advancing the <get> position. Automatically extends the backing
+ // store if necessary. Returns 0 if <offset> or <offset + size> is
+ // past the end of the stream.
+
+ virtual off_t seek (off_t offset, int whence = SEEK_CUR);
+ // Sets the <get> and <recv> positions as follows:
+ // o If <whence> is <SEEK_SET>, the positions are set to <offset>
+ // bytes from the start of the stream.
+ //
+ // o If <whence> is <SEEK_CUR>, the positions are set to the
+ // current <get> position plus <offset>.
+ //
+ // o If <whence> is <SEEK_END>, the positions are set to the size
+ // of the stream plus <offset>.
+
+ virtual int eof (void) const;
+ // Returns 1 if we're at the end of the HTTP stream, else 0.
+
+
+ /*
+ typedef ACE_NOOP_Creation_Strategy<Svc_Handler>
+ NULL_CREATION_STRATEGY;
+ typedef ACE_NOOP_Concurrency_Strategy<Svc_Handler>
+ NULL_ACTIVATION_STRATEGY;
+ typedef ACE_Cached_Connect_Strategy<Svc_Handler,
+ ACE_SOCK_CONNECTOR,
+ ACE_SYNCH_NULL_MUTEX>
+ CACHED_CONNECT_STRATEGY;*/
+
+ Svc_Handler *svc_handler (void);
+
+private:
+ int grow_file_and_remap (void);
+ // Grow the file by reading another chunk from the HTTP socket and
+ // extend the mapping to cover this chunk. Returns -1 on failure or
+ // EOF, else 0.
+
+ //ACE_SOCK_Stream stream_;
+
+ Svc_Handler *svc_handler_;
+ // Connection to peer. The granularity is at the Svc_Handler level.
+ // The Svc_Handler has an SOCK_Stream.
+ /*
+ NULL_CREATION_STRATEGY creation_strategy_;
+ NULL_ACTIVATION_STRATEGY activation_strategy_;
+ // Configure the Strategy Connector with a strategy that caches
+ // connection.
+ CACHED_CONNECT_STRATEGY caching_connect_strategy_;
+
+ STRAT_CONNECTOR *strat_connector_; */
+
+ ACE_Mem_Map mem_map_;
+ // Memory-mapped file that we're iterating over.
+
+ char *recv_pos_;
+ // Pointer to the address where the next <recv> method will start.
+
+ char *get_pos_;
+ // Pointer to the address where the next <get_char> method will
+ // start.
+
+ char *end_of_mapping_plus1_;
+ // Address at the end of the file mapping.
+
+};
+
+#include /**/ "ace/post.h"
+#endif /* _MEM_MAP_STREAM_H */
diff --git a/ACE/examples/Web_Crawler/Options.cpp b/ACE/examples/Web_Crawler/Options.cpp
new file mode 100644
index 00000000000..389cbfa0733
--- /dev/null
+++ b/ACE/examples/Web_Crawler/Options.cpp
@@ -0,0 +1,177 @@
+// $Id$
+
+#include "ace/Get_Opt.h"
+#include "ace/Log_Msg.h"
+#include "URL_Addr.h"
+#include "Options.h"
+#include "ace/OS_NS_string.h"
+
+ACE_RCSID(Web_Crawler, Options, "$Id$")
+
+int
+Options::parse_args (int argc, ACE_TCHAR *argv[])
+{
+  // Resets every command-line controlled option to its default and
+  // then applies the switches found in <argv>.  Always returns 0; an
+  // unrecognized switch prints the usage text and aborts via "%a".
+  ACE_Get_Opt getopt (argc, argv, ACE_TEXT ("df:h:i:l:rt:u:vo:p:"));
+
+  ACE_LOG_MSG->open (argv[0]);
+
+  // Defaults.
+  this->hostname_ = ACE_TEXT ("www.cs.wustl.edu");
+  this->uri_ = ACE_TEXT ("index.html");
+  this->recurse_ = 0;
+  this->debug_ = 0;
+  this->timeout_.sec (ACE_DEFAULT_TIMEOUT);
+  this->url_filter_ = 0;
+  this->verbose_ = 0;
+  this->order_ = ACE_TEXT ("FIFO");
+  this->port_no_ = ACE_DEFAULT_HTTP_PORT;
+
+  // The default is to make this limit as large as possible.
+  this->handle_limit_ = -1;
+
+  for (int c;
+       (c = getopt ()) != EOF;
+       )
+    switch (c)
+      {
+      case ACE_TEXT ('d'):
+        this->debug_ = 1;
+        break;
+      case ACE_TEXT ('f'):
+        this->url_filter_ = getopt.opt_arg ();
+        break;
+      case ACE_TEXT ('h'):
+        this->hostname_ = getopt.opt_arg ();
+        break;
+      case ACE_TEXT ('i'):
+        this->uri_ = getopt.opt_arg ();
+        break;
+      case ACE_TEXT ('l'):
+        this->handle_limit_ = ACE_OS::atoi (getopt.opt_arg ());
+        break;
+      case ACE_TEXT ('r'):
+        this->recurse_ = 1;
+        break;
+      case ACE_TEXT ('t'):
+        this->timeout_.sec (ACE_OS::atoi (getopt.opt_arg ()));
+        break;
+      case ACE_TEXT ('u'):
+        {
+          // "host/path": the argument is split in place at the first
+          // '/', so <hostname_> and <uri_> both point into argv.
+          this->hostname_ = getopt.opt_arg ();
+          ACE_TCHAR *s = ACE_OS::strchr (getopt.opt_arg (), ACE_TEXT ('/'));
+          if (s != 0)
+            {
+              this->uri_ = s + 1;
+              *s = ACE_TEXT ('\0');
+            }
+          else
+            ACE_ERROR ((LM_ERROR,
+                        ACE_TEXT ("invalid URL %s\n"),
+                        getopt.opt_arg ()));
+        }
+        break;
+      case ACE_TEXT ('v'):
+        this->verbose_ = 1;
+        break;
+      case ACE_TEXT ('o'):
+        this->order_ = getopt.opt_arg ();
+        break;
+      case ACE_TEXT ('p'):
+        this->port_no_ = ACE_OS::atoi (getopt.opt_arg ());
+        break;
+      default:
+        // List every switch the option string above accepts.
+        ACE_ERROR ((LM_ERROR,
+                    ACE_TEXT ("usage: %n [-d] [-f filter] [-h hostname]")
+                    ACE_TEXT (" [-i uri] [-l handle-limit] [-o FIFO|LIFO]")
+                    ACE_TEXT (" [-p port] [-r] [-t timeout] [-u host/path]")
+                    ACE_TEXT (" [-v]\n%a"),
+                    1));
+
+        /* NOTREACHED */
+      }
+
+  return 0;
+}
+
+// Port number of the Web server.
+int
+Options::port_no (void) const
+{
+  return port_no_;
+}
+
+// Non-zero when embedded links should be followed recursively.
+int
+Options::recurse (void) const
+{
+  return recurse_;
+}
+
+// Pointer to the timeout value owned by this object.
+const ACE_Time_Value *
+Options::timeout (void) const
+{
+  const ACE_Time_Value &tv = this->timeout_;
+  return &tv;
+}
+
+// Non-zero when debugging was requested.
+int
+Options::debug (void) const
+{
+  return debug_;
+}
+
+// Non-zero when verbose output was requested.
+int
+Options::verbose (void) const
+{
+  return verbose_;
+}
+
+// Traversal order string (defaults to "FIFO" after <parse_args>).
+const ACE_TCHAR *
+Options::order (void) const
+{
+  return order_;
+}
+// Host name of the initial Web server.
+const ACE_TCHAR *
+Options::hostname (void) const
+{
+  return hostname_;
+}
+
+// Initial URI (stored in <uri_>).
+const ACE_TCHAR *
+Options::path_name (void) const
+{
+  return uri_;
+}
+
+// URL filter string, or 0 when no -f switch was given to <parse_args>.
+const ACE_TCHAR *
+Options::url_filter (void) const
+{
+  return url_filter_;
+}
+
+// Currently registered <Command_Processor>.
+Command_Processor *
+Options::command_processor (void) const
+{
+  return command_processor_;
+}
+
+// Register the <Command_Processor>; only the pointer is stored.
+void
+Options::command_processor (Command_Processor *cp)
+{
+  command_processor_ = cp;
+}
+
+// Currently registered <URL_Visitor>.
+URL_Visitor *
+Options::visitor (void) const
+{
+  return visitor_;
+}
+
+// Register the <URL_Visitor>; only the pointer is stored.
+void
+Options::visitor (URL_Visitor *v)
+{
+  visitor_ = v;
+}
+
+// Handle (descriptor) limit; -1 is the default set by <parse_args>.
+int
+Options::handle_limit (void)
+{
+  return handle_limit_;
+}
diff --git a/ACE/examples/Web_Crawler/Options.h b/ACE/examples/Web_Crawler/Options.h
new file mode 100644
index 00000000000..ef5f2efd40c
--- /dev/null
+++ b/ACE/examples/Web_Crawler/Options.h
@@ -0,0 +1,124 @@
+/* -*- C++ -*- */
+// $Id$
+
+// ============================================================================
+//
+// = LIBRARY
+// examples/Web_Crawler
+//
+// = FILENAME
+// Options.h
+//
+// = AUTHOR
+// Douglas C. Schmidt <schmidt@cs.wustl.edu>
+//
+// ============================================================================
+
+#ifndef _OPTIONS_H
+#define _OPTIONS_H
+
+#include "ace/Null_Mutex.h"
+#include "ace/Singleton.h"
+#include "ace/Time_Value.h"
+
+#if !defined (ACE_LACKS_PRAGMA_ONCE)
+#pragma once
+#endif /* ACE_LACKS_PRAGMA_ONCE */
+
+// Forward decls.
+class Command_Processor;
+class URL_Visitor;
+
+class Options
+{
+  // = TITLE
+  //     Maintains the global options.
+  //
+  // = DESCRIPTION
+  //     This class is converted into a Singleton by the
+  //     <ACE_Singleton> template.
+public:
+  Options (void)
+    : recurse_ (0),
+      hostname_ (0),
+      uri_ (0),
+      debug_ (0),
+      verbose_ (0),
+      order_ (0),
+      url_filter_ (0),
+      command_processor_ (0),
+      visitor_ (0),
+      port_no_ (0),
+      handle_limit_ (-1)
+  {
+  }
+  // Put every member into a known state so that accessors used before
+  // <parse_args> (or before the pointers are registered) never read
+  // indeterminate values.  <parse_args> installs the real defaults.
+
+  int parse_args (int argc, ACE_TCHAR *argv[]);
+  // Parse the command-line arguments and initialize the options.
+
+  int recurse (void) const;
+  // If non-0 and the link is an HTML file then recursively check all
+  // links that are embedded in the body of file.
+
+  const ACE_TCHAR *hostname (void) const;
+  // Return the hostname of the initial Web server.
+
+  const ACE_TCHAR *path_name (void) const;
+  // Return the initial URI.
+
+  const ACE_TCHAR *url_filter (void) const;
+  // String used to filter out which URLs to validate.
+
+  int debug (void) const;
+  // Are we debugging?
+
+  int verbose (void) const;
+  // Are we being verbose?
+
+  const ACE_TCHAR *order (void) const;
+  // Traversal order: "LIFO" or "FIFO".
+
+  int port_no (void) const;
+  // Port number of the Web server.
+
+  const ACE_Time_Value *timeout (void) const;
+  // Return the timeout used to prevent hanging on <recv> and
+  // <connect> calls to broken servers.
+
+  // = Get/set the <Command_Processor>.
+  Command_Processor *command_processor (void) const;
+  void command_processor (Command_Processor *);
+
+  // = Get/set the <URL_Visitor>.
+  URL_Visitor *visitor (void) const;
+  void visitor (URL_Visitor *);
+
+  // Get the handle_limit.
+  int handle_limit (void);
+private:
+  int recurse_;
+  // Are we recursing?
+
+  const ACE_TCHAR *hostname_;
+  // Initial Web server name.
+
+  const ACE_TCHAR *uri_;
+  // Initial URI name.
+
+  int debug_;
+  // Are we debugging?
+
+  int verbose_;
+  // Are we being verbose?
+
+  const ACE_TCHAR *order_;
+  // Whether the URLs are traversed in FIFO or LIFO order.
+
+  ACE_Time_Value timeout_;
+  // Timeout on <recv> and <connect> to broken Web servers.
+
+  const ACE_TCHAR *url_filter_;
+  // String used to filter out which URLs to validate.
+
+  Command_Processor *command_processor_;
+  // Pointer to the Command_Processor.
+
+  URL_Visitor *visitor_;
+  // Pointer to the <URL_Visitor>.
+
+  int port_no_;
+  // Port number.
+
+  int handle_limit_;
+  // The limit of the number of descriptors to be given for this process.
+};
+
+// Typedef an Options Singleton.
+typedef ACE_Singleton <Options, ACE_Null_Mutex> OPTIONS;
+
+#endif /* _OPTIONS_H */
diff --git a/ACE/examples/Web_Crawler/README b/ACE/examples/Web_Crawler/README
new file mode 100644
index 00000000000..4f81809173d
--- /dev/null
+++ b/ACE/examples/Web_Crawler/README
@@ -0,0 +1,25 @@
+Web Crawler Kirthika Parameswaran
+-----------
+
+The Web Crawler follows the HTTP/1.1 protocol.
+
+This Crawler crawls in either FIFO or LIFO order over the URLs,
+which are now stored in an ACE_Unbounded_Queue. The Command Processor
+pattern is used in this example.
+
+This example also includes the auto-purging feature, which removes
+connections from the cache when the process runs out of file descriptors.
+
+[Use the -l option to set the handle limit].
+
+Run:
+---
+
+
+> make
+
+> main -r -u www.cs.wustl.edu/~kirthika/test.html -o LIFO
+
+or
+
+> main -r -u www.cs.wustl.edu/~kirthika/test.html -o FIFO
diff --git a/ACE/examples/Web_Crawler/URL.cpp b/ACE/examples/Web_Crawler/URL.cpp
new file mode 100644
index 00000000000..ce52ed892ad
--- /dev/null
+++ b/ACE/examples/Web_Crawler/URL.cpp
@@ -0,0 +1,39 @@
+// $Id$
+
+#include "URL.h"
+
+ACE_RCSID(Web_Crawler, URL, "$Id$")
+
+// Gives access to the stream holding this URL's contents.
+Mem_Map_Stream &
+URL::stream (void)
+{
+  return stream_;
+}
+
+URL::~URL (void) // Empty body: no explicit cleanup is done here.
+{
+}
+
+// Reply status most recently recorded for this URL.
+const URL_Status &
+URL::reply_status (void)
+{
+  return reply_status_;
+}
+
+// Record a copy of <rs> as this URL's reply status.
+void
+URL::reply_status (const URL_Status &rs)
+{
+  reply_status_ = rs;
+}
+
+// Content type most recently recorded for this URL.
+const ACE_CString &
+URL::content_type (void)
+{
+  return content_type_;
+}
+
+// Record a copy of <ct> as this URL's content type.
+void
+URL::content_type (const ACE_CString &ct)
+{
+  content_type_ = ct;
+}
diff --git a/ACE/examples/Web_Crawler/URL.h b/ACE/examples/Web_Crawler/URL.h
new file mode 100644
index 00000000000..68c41f018ad
--- /dev/null
+++ b/ACE/examples/Web_Crawler/URL.h
@@ -0,0 +1,82 @@
+/* -*- C++ -*- */
+// $Id$
+
+// ============================================================================
+//
+// = LIBRARY
+// examples/Web_Crawler
+//
+// = FILENAME
+// URL.h
+//
+// = AUTHOR
+// Douglas C. Schmidt <schmidt@cs.wustl.edu>
+//
+// ============================================================================
+
+#ifndef _URL_H
+#define _URL_H
+
+#include "Mem_Map_Stream.h"
+
+#if !defined (ACE_LACKS_PRAGMA_ONCE)
+#pragma once
+#endif /* ACE_LACKS_PRAGMA_ONCE */
+
+#include "URL_Addr.h"
+#include "URL_Status.h"
+
+#include "ace/SString.h"
+
+
+// Forward declaration.
+class URL_Visitor;
+
+class URL
+{
+ // = TITLE
+ // Abstract base class for a URL.
+ //
+ // = DESCRIPTION
+ // This class plays the "element" role in the Visitor pattern.
+public:
+ virtual ~URL (void);
+ // Destructor.
+
+ virtual int accept (URL_Visitor *visitor) = 0;
+ // Accept the visitor, which will then perform a particular
+ // visitation strategy on the URL. This method is part of the
+ // Visitor pattern.
+
+ virtual ssize_t send_request (void) = 0;
+ // Send a <GET> command to fetch the contents in the URI from the
+ // server.
+
+ virtual const ACE_URL_Addr &url_addr (void) const = 0;
+ // Returns the URL that we represent.
+
+ virtual Mem_Map_Stream &stream (void);
+ // Returns the <Mem_Map_Stream>.
+
+ // = Get/set the reply status.
+ virtual const URL_Status &reply_status (void);
+ virtual void reply_status (const URL_Status &);
+
+ // = Get/set the content type.
+ virtual const ACE_CString &content_type (void);
+ virtual void content_type (const ACE_CString &);
+
+
+
+private:
+ URL_Status reply_status_;
+ // Reply status of the URL.
+
+ ACE_CString content_type_;
+ // Content-type of the URL.
+
+ Mem_Map_Stream stream_;
+ // Stream holding the contents fetched for this URL.
+};
+
+#endif /* _URL_H */
diff --git a/ACE/examples/Web_Crawler/URL_Addr.cpp b/ACE/examples/Web_Crawler/URL_Addr.cpp
new file mode 100644
index 00000000000..5a630e387fb
--- /dev/null
+++ b/ACE/examples/Web_Crawler/URL_Addr.cpp
@@ -0,0 +1,234 @@
+// $Id$
+
+#include "URL_Addr.h"
+#include "ace/Log_Msg.h"
+#include "ace/OS_NS_string.h"
+#include "ace/OS_NS_stdio.h"
+#include "ace/OS_NS_stdlib.h"
+#include "ace/OS_Memory.h"
+
+ACE_RCSID (Web_Crawler,
+ URL_Addr,
+ "$Id$")
+
+
+ACE_URL_Addr::ACE_URL_Addr (void) // Default constructor: no path, no cached string.
+ : path_name_ (0),
+ addr_string_ (0), // Buffer for addr_to_string() is allocated on demand.
+ addr_string_len_ (0)
+{
+}
+
+int
+ACE_URL_Addr::addr_to_string (ACE_TCHAR *s,
+                              size_t size,
+                              int ipaddr_format) const
+{
+  // Formats "host:port/path" into the caller-supplied buffer <s> of
+  // <size> characters.  Returns -1 if the worst-case length does not
+  // fit, else 0.
+  const size_t needed =
+    ACE_OS::strlen (ipaddr_format == 0 ?
+                    this->get_host_name () :
+                    this->get_host_addr ())
+    + ACE_OS::strlen ("65536") // Assume the max port number.
+    + ACE_OS::strlen (this->get_path_name ())
+    + sizeof (':')
+    + sizeof ('/')
+    + sizeof ('\0'); // For trailing '\0'.
+
+  if (size < needed)
+    return -1;
+
+  ACE_OS::sprintf (s, ACE_TEXT ("%s:%d/%s"),
+                   ACE_TEXT_CHAR_TO_TCHAR (ipaddr_format == 0
+                                           ? this->get_host_name ()
+                                           : this->get_host_addr ()),
+                   this->get_port_number (),
+                   this->get_path_name ());
+  return 0;
+}
+
+const ACE_TCHAR *
+ACE_URL_Addr::addr_to_string (int ipaddr_format) const
+{
+  // Formats "host:port/path" into an internal buffer that is grown on
+  // demand and released by the destructor.  Returns the buffer, or 0
+  // if memory could not be obtained (the previous buffer is kept).
+  ACE_URL_Addr *this_ptr = const_cast<ACE_URL_Addr *> (this);
+
+  // Worst-case length, counted in characters.
+  const size_t size =
+    ACE_OS::strlen (ipaddr_format == 0 ?
+                    this->get_host_name () :
+                    this->get_host_addr ())
+    + ACE_OS::strlen ("65536") // Assume the max port number.
+    + ACE_OS::strlen (this->get_path_name ())
+    + sizeof (':')
+    + sizeof ('/')
+    + sizeof ('\0'); // For trailing '\0'.
+
+  if (size > this->addr_string_len_)
+    {
+      // Grow via a temporary so a failed realloc() neither leaks the
+      // old buffer nor leaves <addr_string_len_> describing a null
+      // pointer.  <size> is a character count, so scale it to bytes
+      // for wide-character builds.
+      ACE_TCHAR *grown = 0;
+      ACE_ALLOCATOR_RETURN (grown,
+                            static_cast<ACE_TCHAR *> (
+                              ACE_OS::realloc (static_cast<void *> (this->addr_string_),
+                                               size * sizeof (ACE_TCHAR))),
+                            0);
+      this_ptr->addr_string_ = grown;
+      this_ptr->addr_string_len_ = size;
+    }
+  ACE_OS::sprintf (this->addr_string_,
+                   ACE_TEXT ("%s:%d/%s"),
+                   ACE_TEXT_CHAR_TO_TCHAR (ipaddr_format == 0
+                                           ? this->get_host_name ()
+                                           : this->get_host_addr ()),
+                   this->get_port_number (),
+                   this->get_path_name ());
+  return this->addr_string_;
+}
+
+int
+ACE_URL_Addr::string_to_addr (const ACE_TCHAR *s)
+{
+  // Parses "host[:port][/path]" into this address.  A missing or empty
+  // path becomes "index.html"; a missing port becomes
+  // <ACE_DEFAULT_HTTP_PORT>.  Returns -1 if memory cannot be obtained,
+  // otherwise the result of <ACE_INET_Addr::set>.
+  int result;
+  ACE_TCHAR *t;
+
+  // Need to make a duplicate since we'll be overwriting the string.
+  ACE_ALLOCATOR_RETURN (t,
+                        ACE_OS::strdup (s),
+                        -1);
+
+  // First split off the path_name.
+  ACE_TCHAR *path_name = ACE_OS::strchr (t, ACE_TEXT ('/'));
+  const ACE_TCHAR *name = ACE_TEXT ("index.html");
+  if (path_name != 0)
+    {
+      // Skip over '/' unless nothing follows it.
+      if (ACE_OS::strlen (path_name + 1) > 0)
+        name = path_name + 1;
+
+      *path_name = '\0';
+    }
+
+  // Duplicate the path before touching <path_name_> so that a failed
+  // allocation releases the scratch copy and leaves this object as it
+  // was.
+  ACE_TCHAR *new_path = ACE_OS::strdup (name);
+  if (new_path == 0)
+    {
+      ACE_OS::free (ACE_MALLOC_T (t));
+      errno = ENOMEM;
+      return -1;
+    }
+
+  // Release any path left from an earlier call or copy.
+  ACE_OS::free (ACE_MALLOC_T (this->path_name_));
+  this->path_name_ = new_path;
+
+  // Now handle the host address and port number.
+  ACE_TCHAR *port_number = ACE_OS::strchr (t, ACE_TEXT (':'));
+
+  if (port_number == 0)
+    {
+      // Assume it's an ip-address or ip-number.
+      result = this->ACE_INET_Addr::set (ACE_DEFAULT_HTTP_PORT,
+                                         t);
+    }
+  else
+    {
+      *port_number = '\0';
+      u_short port = (u_short) ACE_OS::atoi (port_number + 1); // Skip over ':'
+      result = this->ACE_INET_Addr::set (port, t);
+    }
+
+  ACE_OS::free (ACE_MALLOC_T (t));
+  return result;
+}
+
+// Copy constructor: start out empty, then delegate the copy to <set>.
+ACE_URL_Addr::ACE_URL_Addr (const ACE_URL_Addr &addr)
+  : ACE_INET_Addr (),
+    path_name_ (0),
+    addr_string_ (0),
+    addr_string_len_ (0)
+{
+  const int status = this->set (addr);
+
+  if (status == -1)
+    {
+      ACE_ERROR ((LM_ERROR,
+                  ACE_TEXT ("%p\n"),
+                  ACE_TEXT ("ACE_URL_Addr::ACE_URL_Addr")));
+    }
+}
+
+int
+ACE_URL_Addr::set (const ACE_URL_Addr &addr)
+{
+  // Makes this object a copy of <addr>.  Returns 0 on success and -1
+  // if the base address cannot be copied or memory is exhausted.
+
+  // Self-assignment would otherwise duplicate a string just freed.
+  if (this == &addr)
+    return 0;
+
+  // Drop the old strings and reset the pointers so that no early
+  // return below leaves them dangling for the destructor.
+  ACE_OS::free (static_cast<void *> (this->path_name_));
+  this->path_name_ = 0;
+
+  // The address string is only scratch space that <addr_to_string>
+  // rewrites on every call, so it is reset rather than copied; this
+  // also keeps <addr_string_len_> from exceeding the real buffer size.
+  ACE_OS::free (static_cast<void *> (this->addr_string_));
+  this->addr_string_ = 0;
+  this->addr_string_len_ = 0;
+
+  if (this->ACE_INET_Addr::set (addr) == -1)
+    return -1;
+
+  if (addr.path_name_ != 0)
+    ACE_ALLOCATOR_RETURN (this->path_name_,
+                          ACE_OS::strdup (addr.path_name_),
+                          -1);
+  return 0;
+}
+
+void
+ACE_URL_Addr::operator= (const ACE_URL_Addr &addr)
+{
+  // Assignment delegates to <set>; a failure is only logged because
+  // this operator returns void.
+  if (this->set (addr) == -1)
+    ACE_ERROR ((LM_ERROR,
+                ACE_TEXT ("%p\n"),
+                ACE_TEXT ("ACE_URL_Addr::operator=")));
+}
+
+// Hash value: the base-class hash plus the hash of the path name.
+u_long
+ACE_URL_Addr::hash (void) const
+{
+  return this->ACE_INET_Addr::hash ()
+    + ACE::hash_pjw (this->get_path_name ());
+}
+
+// Equal when path name, port number and IP address all match.
+bool
+ACE_URL_Addr::operator== (const ACE_URL_Addr &addr) const
+{
+  if (ACE_OS::strcmp (addr.get_path_name (),
+                      this->get_path_name ()) != 0)
+    return false;
+
+  if (addr.get_port_number () != this->get_port_number ())
+    return false;
+
+  return addr.get_ip_address () == this->get_ip_address ();
+}
+
+// Inequality is defined as the negation of operator==.
+bool
+ACE_URL_Addr::operator!= (const ACE_URL_Addr &addr) const
+{
+  const bool same = (*this == addr);
+  return !same;
+}
+
+ACE_URL_Addr::ACE_URL_Addr (const ACE_TCHAR *host_name,
+ const ACE_TCHAR *path_name,
+ u_short port)
+ : ACE_INET_Addr (port, host_name),
+ path_name_ (ACE_OS::strdup (path_name)), // Private copy, freed in the destructor.
+ addr_string_ (0), // Buffer for addr_to_string() is allocated on demand.
+ addr_string_len_ (0)
+{
+}
+
+// Path name held by this address (0 for a default-constructed object).
+const ACE_TCHAR *
+ACE_URL_Addr::get_path_name (void) const
+{
+  return path_name_;
+}
+
+// Release both heap strings owned by this address.
+ACE_URL_Addr::~ACE_URL_Addr (void)
+{
+  void *path = static_cast<void *> (this->path_name_);
+  void *cached = static_cast<void *> (this->addr_string_);
+
+  ACE_OS::free (path);
+  ACE_OS::free (cached);
+  this->path_name_ = 0;
+}
+
+int
+ACE_URL_Addr::destroy (void)
+{
+ // Self-delete; the object must not be used after this call. Always returns 0.
+ delete this;
+ return 0;
+}
diff --git a/ACE/examples/Web_Crawler/URL_Addr.h b/ACE/examples/Web_Crawler/URL_Addr.h
new file mode 100644
index 00000000000..9792e1bb390
--- /dev/null
+++ b/ACE/examples/Web_Crawler/URL_Addr.h
@@ -0,0 +1,111 @@
+// -*- C++ -*-
+
+// $Id$
+
+// ============================================================================
+//
+// = LIBRARY
+// examples/Web_Crawler
+//
+// = FILENAME
+// URL_Addr.h
+//
+// = AUTHOR
+// Douglas C. Schmidt <schmidt@cs.wustl.edu>
+//
+// ============================================================================
+
+#ifndef ACE_URL_ADDR_H
+#define ACE_URL_ADDR_H
+
+#include "ace/INET_Addr.h"
+
+#if !defined (ACE_LACKS_PRAGMA_ONCE)
+#pragma once
+#endif /* ACE_LACKS_PRAGMA_ONCE */
+
+#include "ace/ACE.h"
+
+class ACE_URL_Addr : public ACE_INET_Addr
+{
+ // = TITLE
+ // Defines a URL address family address format.
+public:
+ // = Initialization and termination methods.
+ ACE_URL_Addr (void);
+ // Constructor.
+
+ ACE_URL_Addr (const ACE_TCHAR *host_name,
+ const ACE_TCHAR *path_name,
+ u_short port = ACE_DEFAULT_HTTP_PORT); // Keeps its own copy of <path_name>.
+
+ ACE_URL_Addr (const ACE_URL_Addr &addr);
+ // Copy constructor.
+
+ int set (const ACE_URL_Addr &addr);
+ // Essentially the copy constructor.
+
+ virtual int string_to_addr (const ACE_TCHAR *address);
+ // Initializes an <ACE_URL_Addr> from the <address>, which can be
+ // "ip-name:port-number/path-name" (e.g.,
+ // "www.cs.wustl.edu:1234/~schmidt/") or "ip-number:port-number/path-name"
+ // (e.g., "128.252.166.57:1234/~schmidt"). If there is no ':' in
+ // the <address> it is assumed to be an ip-number or ip-address
+ // number, with the port number <ACE_DEFAULT_HTTP_PORT>.
+
+ virtual int addr_to_string (ACE_TCHAR *s,
+ size_t size,
+ int ipaddr_format = 1) const;
+ // Transform the current <ACE_INET_Addr> address into string format.
+ // If <ipaddr_format> is non-0 this produces
+ // "ip-number:port-number/path-name" (e.g.,
+ // "128.252.166.57:80/~schmidt/"), whereas if <ipaddr_format> is 0
+ // this produces "ip-name:port-number/path-name" (e.g.,
+ // "www.cs.wustl.edu:80/~schmidt/"). Returns -1 if the <size> of
+ // the buffer <s> is too small, else 0.
+
+ virtual const ACE_TCHAR *addr_to_string (int ipaddr_format = 1) const;
+ // Transform the current <ACE_INET_Addr> address into string format.
+ // If <ipaddr_format> is non-0 this produces
+ // "ip-number:port-number/path-name" (e.g.,
+ // "128.252.166.57:80/~schmidt/"), whereas if <ipaddr_format> is 0
+ // this produces "ip-name:port-number/path-name" (e.g.,
+ // "www.cs.wustl.edu:80/~schmidt/"). Uses dynamic memory, which
+ // is allocated on demand and deallocated when the object is
+ // destroyed. Returns 0 if dynamic memory fails, else the string.
+
+ void operator= (const ACE_URL_Addr &addr);
+ // Assignment operator.
+
+ ~ACE_URL_Addr (void);
+ // Destructor.
+
+ bool operator == (const ACE_URL_Addr &SAP) const;
+ // Compare two addresses for equality. The addresses are considered
+ // equal if they contain the same IP address, port number, and path
+ // name.
+
+ bool operator != (const ACE_URL_Addr &SAP) const;
+ // Compare two addresses for inequality.
+
+ virtual u_long hash (void) const;
+ // Computes and returns hash value.
+
+ const ACE_TCHAR *get_path_name (void) const;
+ // Return the path name.
+
+ int destroy (void);
+ // Delete this object; it must not be used afterwards.
+private:
+ ACE_TCHAR *path_name_;
+ // Our path name.
+
+ ACE_TCHAR *addr_string_;
+ // The dynamically allocated address string that's used for the
+ // <addr_to_string> method.
+
+ size_t addr_string_len_;
+ // Current length of the <addr_string_>
+};
+
+#endif /* ACE_URL_ADDR_H */
diff --git a/ACE/examples/Web_Crawler/URL_Status.cpp b/ACE/examples/Web_Crawler/URL_Status.cpp
new file mode 100644
index 00000000000..35a57420593
--- /dev/null
+++ b/ACE/examples/Web_Crawler/URL_Status.cpp
@@ -0,0 +1,40 @@
+/* -*- C++ -*- */
+// $Id$
+
+#include "URL_Status.h"
+
+ACE_RCSID(Web_Crawler, URL_Status, "$Id$")
+
+URL_Status::URL_Status (STATUS_CODE code) // Start out holding <code>.
+ : status_ (code)
+{
+}
+
+URL_Status::URL_Status (const URL_Status &s) // Copy the status code held by <s>.
+ : status_ (s.status_)
+{
+}
+
+// Current status code.
+URL_Status::STATUS_CODE
+URL_Status::status (void) const
+{
+  return status_;
+}
+
+// Store a raw integer code; it is converted to STATUS_CODE unchecked.
+void
+URL_Status::status (int s)
+{
+  const URL_Status::STATUS_CODE code =
+    static_cast<URL_Status::STATUS_CODE> (s);
+  this->status_ = code;
+}
+
+// Store the given status code.
+void
+URL_Status::status (URL_Status::STATUS_CODE s)
+{
+  status_ = s;
+}
+
+// Self-delete; the object must not be used afterwards.  Always returns 0.
+int
+URL_Status::destroy (void)
+{
+  delete this;
+  return 0;
+}
diff --git a/ACE/examples/Web_Crawler/URL_Status.h b/ACE/examples/Web_Crawler/URL_Status.h
new file mode 100644
index 00000000000..672c5e4f240
--- /dev/null
+++ b/ACE/examples/Web_Crawler/URL_Status.h
@@ -0,0 +1,61 @@
+/* -*- C++ -*- */
+// $Id$
+
+// ============================================================================
+//
+// = LIBRARY
+// examples/Web_Crawler
+//
+// = FILENAME
+// URL_Status.h
+//
+// = AUTHOR
+// Douglas C. Schmidt <schmidt@cs.wustl.edu>
+//
+// ============================================================================
+
+#ifndef _URL_STATUS_H
+#define _URL_STATUS_H
+
+#include "ace/config-all.h"
+
+#if !defined (ACE_LACKS_PRAGMA_ONCE)
+#pragma once
+#endif /* ACE_LACKS_PRAGMA_ONCE */
+
+class URL_Status
+{
+ // = TITLE: Holds the status code recorded for a URL reply.
+public:
+ enum STATUS_CODE
+ {
+ STATUS_OK = 200,
+ STATUS_CREATED = 201,
+ STATUS_ACCEPTED = 202,
+ STATUS_NO_CONTENT = 204,
+ STATUS_MOVED_PERMANENTLY = 301,
+ STATUS_MOVED_TEMPORARILY = 302,
+ STATUS_NOT_MODIFIED = 304,
+ STATUS_BAD_REQUEST = 400,
+ STATUS_UNAUTHORIZED = 401,
+ STATUS_FORBIDDEN = 403,
+ STATUS_ITEM_NOT_FOUND = 404,
+ STATUS_INTERNAL_SERVER_ERROR = 500,
+ STATUS_OP_NOT_IMPLEMENTED = 501,
+ STATUS_BAD_GATEWAY = 502,
+ STATUS_SERVICE_UNAVAILABLE = 503,
+ STATUS_INSUFFICIENT_DATA = 399 // Used as the default-constructed value below.
+ };
+
+ URL_Status (STATUS_CODE = STATUS_INSUFFICIENT_DATA);
+ URL_Status (const URL_Status &);
+
+ STATUS_CODE status (void) const; // Get the current code.
+ void status (int); // Set from a raw integer code.
+ void status (STATUS_CODE); // Set from an enumerator.
+ int destroy (void); // Delete this object.
+private:
+ STATUS_CODE status_; // The stored status code.
+};
+
+#endif /* _URL_STATUS_H */
diff --git a/ACE/examples/Web_Crawler/URL_Visitor.cpp b/ACE/examples/Web_Crawler/URL_Visitor.cpp
new file mode 100644
index 00000000000..481a7140089
--- /dev/null
+++ b/ACE/examples/Web_Crawler/URL_Visitor.cpp
@@ -0,0 +1,543 @@
+// $Id$
+
+#include "ace/OS_NS_string.h"
+#include "URL_Visitor.h"
+#include "Command_Processor.h"
+
+ACE_RCSID(Web_Crawler, URL_Visitor, "$Id$")
+
+URL_Processing_Strategy::URL_Processing_Strategy (URL &url,
+ URL_Iterator &iterator)
+ : url_ (url), // Reference to the URL being processed.
+ iterator_ (iterator) // Reference to the iterator used while processing.
+{
+}
+
+URL_Processing_Strategy::~URL_Processing_Strategy (void) // Empty: only references are held.
+{
+}
+
+int
+URL_Processing_Strategy::destroy (void)
+{
+ // Self-delete; the object must not be used after this call. Always returns 0.
+ delete this;
+ return 0;
+}
+
+URL_Download_Strategy::URL_Download_Strategy (URL &url,
+ URL_Iterator &iterator)
+ : URL_Processing_Strategy (url, iterator) // Everything is stored by the base class.
+{
+}
+
+int
+URL_Download_Strategy::execute (void)
+{
+  // Pull chunks from the iterator until it reports 0 and write each
+  // one to the debug log.  Always returns 0.
+  ACE_CString chunk;
+
+  for (;;)
+    {
+      if (this->iterator_.next (chunk) == 0)
+        break;
+
+      ACE_DEBUG ((LM_DEBUG,
+                  "%s",
+                  chunk.c_str ()));
+    }
+
+  return 0;
+}
+
+HTTP_Header_Processing_Strategy::HTTP_Header_Processing_Strategy (URL &url,
+ URL_Iterator &iterator)
+ : URL_Processing_Strategy (url, iterator) // Everything is stored by the base class.
+{
+}
+
+int
+HTTP_Header_Processing_Strategy::execute (void)
+{
+  // Walks the header lines delivered by the iterator.  The first line
+  // supplies the reply status, which is stored in the URL; later lines
+  // are scanned for a "text/html" content type.  Returns 0 if the
+  // status is 200, else -1.
+
+  // Set the get() position.  Necessary since later a peek is done.
+  if (this->url_.stream ().get_char () == 0)
+    ACE_ERROR_RETURN ((LM_ERROR,
+                       "%p\n","Header Not Found"),
+                      -1);
+
+  // Start from an empty string; the iterator fills it in.
+  ACE_CString line;
+
+  // Get the lines in the header iteratively and check for status info.
+  int result = 1, i = 0;
+  for (i = 0, result = this->iterator_.next (line);
+       result > 0;
+       ++i, result = this->iterator_.next (line))
+    {
+      if (i == 0)
+        {
+          // Without an "HTTP" token there is no status to parse, so
+          // record the default (insufficient data) status and fail.
+          if (line.find ("HTTP", 0) == ACE_CString::npos)
+            {
+              this->url_.reply_status (URL_Status ());
+              return -1;
+            }
+
+          // Assuming that the status-no is a space away.
+          int status_index = line.find ("HTTP", 0);
+          ACE_CString status = line.substring (status_index + 9, //HTTP/1.1 200
+                                               3);
+
+          // A stack object avoids both the allocation and the old
+          // "return 0 (success) when out of memory" path.
+          URL_Status url_status;
+          url_status.status (ACE_OS::atoi (status.c_str ()));
+          this->url_.reply_status (url_status);
+
+          // Invalid url.
+          if (url_status.status () != 200)
+            return -1;
+        }
+      else
+        {
+          if (line.find ("text/html") != ACE_CString::npos)
+            {
+              ACE_CString url_content_type ("text/html");
+              this->url_.content_type (url_content_type);
+            }
+        }
+    }
+  return 0;
+}
+
+HTML_Body_Validation_Strategy::HTML_Body_Validation_Strategy (URL &url,
+ URL_Iterator &iterator,
+ URL_Validation_Visitor &context)
+ : URL_Processing_Strategy (url, iterator),
+ visitor_context_ (context) // Initialized from the visitor passed by the caller.
+{
+}
+
+int
+HTML_Body_Validation_Strategy::execute (void)
+{
+  // Iterates over the URLs found in the HTML body, turns relative
+  // links into "host/path" form and queues a <URL_Command> for each
+  // one on the global command processor.  Returns 0 on success and -1
+  // if memory is exhausted.
+
+  // The strings start out empty; they are filled in below.
+  ACE_CString host_name;
+  host_name.set (url_.url_addr ().get_host_name (), 1);
+
+  // All to facilitate relative paths.
+  ACE_CString prev_location;
+
+  prev_location.set (ACE_TEXT_ALWAYS_CHAR (this->url_.url_addr ().get_path_name ()),
+                     ACE_OS::strlen (this->url_.url_addr ().get_path_name ()),
+                     1);
+  int index = prev_location.rfind ('/', prev_location.length ());
+  ACE_CString str = prev_location.substring (0, index + 1);
+  prev_location.set (str.c_str (), 1);
+
+  // Note: prev_location always ends with '/'
+  if (prev_location[0] != '/')
+    prev_location = "/" + prev_location;
+
+  // Build the url portion which can be attached to the relative paths.
+  prev_location = host_name + prev_location;
+
+  ACE_CString url;
+
+  while (this->iterator_.next (url) > 0)
+    {
+      // Check for relative urls.  Strip out "http://" if it's there.
+      if (url.find ("http") == url.npos)
+        {
+          // The length checks keep &url[3] / &url[2] inside the string.
+          if (url.length () >= 3 && url[0] == '.' && url[1] == '.')
+            {
+              url.set (&url[3], 1);
+              int i = prev_location.rfind ('/', prev_location.length () - 1);
+              // NOTE(review): this shortens <prev_location> for every
+              // later link as well, not just this one -- confirm that
+              // is intended.
+              prev_location = prev_location.substring (0, i+1);
+            }
+          if (url.length () >= 2 && url[0] == '.' && url[1] == '/')
+            url.set (&url[2], 1);
+
+          url = prev_location + url;
+        }
+      else
+        // NOTE(review): assumes the link begins with exactly "http://".
+        url.set (&url[7], 1);
+      // Double slash at the end works!e.g www.cs.wustl.edu/~kirthika//
+      if (url.find (".html") == url.npos)
+        url = url + "/";
+
+      // Create the new URL address.  Allocation failures are reported
+      // as -1 so the caller does not mistake them for success.
+      ACE_URL_Addr *url_addr;
+      ACE_NEW_RETURN (url_addr,
+                      ACE_URL_Addr,
+                      -1);
+      Auto_Destroyer<ACE_URL_Addr> url_addr_ptr (url_addr);
+      if (url_addr_ptr->string_to_addr (ACE_TEXT_CHAR_TO_TCHAR (url.c_str ())) == 0)
+        {
+          HTTP_URL *http_url;
+          ACE_NEW_RETURN (http_url,
+                          HTTP_URL (**url_addr_ptr,
+                                    dynamic_cast<HTTP_URL *> (&this->url_)),
+                          -1);
+          URL_Command *url_command;
+          ACE_NEW_RETURN (url_command,
+                          URL_Command (http_url),
+                          -1);
+
+          OPTIONS::instance ()->command_processor ()->insert (url_command);
+        }
+    }
+  return 0;
+}
+
+// Create a heap-allocated header iterator for our URL; 0 on failure.
+URL_Iterator *
+URL_Validation_Visitation_Strategy_Factory::make_header_iterator (void)
+{
+  URL_Iterator *header_iter = 0;
+  ACE_NEW_RETURN (header_iter,
+                  HTTP_Header_Iterator (*this->url_),
+                  0);
+  return header_iter;
+}
+
+// Create a heap-allocated HTML body iterator for our URL; 0 on failure.
+URL_Iterator *
+URL_Validation_Visitation_Strategy_Factory::make_body_iterator (void)
+{
+  URL_Iterator *body_iter = 0;
+  ACE_NEW_RETURN (body_iter,
+                  HTML_Body_Iterator (*this->url_),
+                  0);
+  return body_iter;
+}
+
+// Create the header processing strategy bound to <iterator>; 0 on failure.
+URL_Processing_Strategy *
+URL_Validation_Visitation_Strategy_Factory::make_header_strategy (URL_Iterator &iterator)
+{
+  URL_Processing_Strategy *strategy = 0;
+  ACE_NEW_RETURN (strategy,
+                  HTTP_Header_Processing_Strategy (*this->url_,
+                                                   iterator),
+                  0);
+  return strategy;
+}
+
+// Create the body validation strategy bound to <iterator>; 0 on failure.
+URL_Processing_Strategy *
+URL_Validation_Visitation_Strategy_Factory::make_body_strategy (URL_Iterator &iterator)
+{
+  URL_Processing_Strategy *strategy = 0;
+  ACE_NEW_RETURN (strategy,
+                  HTML_Body_Validation_Strategy (*this->url_,
+                                                 iterator,
+                                                 this->visitor_context_),
+                  0);
+  return strategy;
+}
+
+int
+URL_Validation_Visitation_Strategy_Factory::destroy (void)
+{
+ // Self-delete; the object must not be used after this call. Always returns 0.
+ delete this;
+ return 0;
+}
+
+URL_Visitor::~URL_Visitor (void) // Empty body: no cleanup is done here.
+{
+}
+
+URL_Validation_Visitor::URL_Validation_Visitor (void)
+{
+  // Builds the caching connect strategy and the strategy connector
+  // that uses it.  ACE_NEW returns from the constructor as soon as an
+  // allocation fails, so both pointers are cleared first to keep the
+  // destructor and <destroy> from seeing an indeterminate value.
+  this->caching_connect_strategy_ = 0;
+  this->strat_connector_ = 0;
+
+  ACE_NEW (this->caching_connect_strategy_,
+           CACHED_CONNECT_STRATEGY (this->caching_strategy_));
+  ACE_NEW (this->strat_connector_,
+           STRATEGY_CONNECTOR (0,
+                               &creation_strategy_,
+                               caching_connect_strategy_,
+                               &activation_strategy_));
+  if (strat_connector_ == 0)
+    // "%p" consumes the message string that follows it.
+    ACE_ERROR ((LM_ERROR,
+                "%p\n",
+                "strategy connector creation failed"));
+}
+
+URL_Validation_Visitor::~URL_Validation_Visitor (void)
+{
+  // The connector itself is deleted in <destroy>; here the pointer is
+  // only cleared before the caching connect strategy is released.
+  this->strat_connector_ = 0;
+
+  // Deleting a null pointer is a no-op, so no explicit test is needed.
+  delete this->caching_connect_strategy_;
+}
+
+// Gives access to the cache of URLs that were already visited.
+URL_Validation_Visitor::URL_CACHE &
+URL_Validation_Visitor::url_cache (void)
+{
+  return url_cache_;
+}
+
+int
+URL_Validation_Visitor::in_cache (const ACE_URL_Addr &url_addr)
+{
+  // Returns 0 if <url_addr> is not cached, 1 if it is cached with
+  // status 200 and -1 if it is cached with any other status.
+  URL_Status cached_status (URL_Status::STATUS_CODE (1));
+
+  if (this->url_cache_.find (url_addr, cached_status) != 0)
+    return 0;
+
+  ACE_DEBUG ((LM_DEBUG,
+              "status %d for URL %s (cached)\n",
+              cached_status.status (),
+              url_addr.addr_to_string (0)));
+
+  // Anything other than 200 is an invalid status.
+  return cached_status.status () != 200 ? -1 : 1;
+}
+
+URL_Visitation_Strategy_Factory *
+URL_Validation_Visitor::make_visitation_strategy_factory (URL &url)
+{
+  // Opens the URL's stream through the strategy connector, sends the
+  // request and, on success, returns a new validation strategy
+  // factory.  Returns 0 on any failure; a failed request is also
+  // cached as "service unavailable".
+
+  // Since this is HTTP 1.1 we'll need to establish a connection
+  // only once.  Trying for relative paths.
+  if (url.stream ().open (this->strat_connector_,
+                          url.url_addr ()) == -1)
+    return 0;
+
+  // See if we can get connected and send the GET request via the
+  // <HTTP_URL>.
+  const int request_result = url.send_request ();
+
+  if (request_result == -1)
+    {
+      ACE_ERROR ((LM_ERROR,
+                  "%p\n",
+                  "send_request"));
+
+      const URL_Status unavailable (URL_Status::STATUS_SERVICE_UNAVAILABLE);
+      if (this->url_cache_.bind (url.url_addr (), unavailable) == -1)
+        ACE_ERROR ((LM_ERROR,
+                    "%p\n",
+                    "bind"));
+      return 0;
+    }
+
+  // @@ Here's where we could check to see if the <url> was HTTP or
+  // FTP, etc.  But for now we'll just assume that everything is an
+  // HTTP URL.
+  URL_Visitation_Strategy_Factory *factory = 0;
+  ACE_NEW_RETURN (factory,
+                  URL_Validation_Visitation_Strategy_Factory (&url,
+                                                              *this),
+                  0);
+  return factory;
+}
+
+int
+URL_Validation_Visitor::destroy (void)
+{
+ delete this->strat_connector_; // The destructor only clears this pointer, so delete it here.
+ // Self-delete; the object must not be used after this call. Always returns 0.
+ delete this;
+ return 0;
+}
+
+int
+URL_Validation_Visitor::visit (HTTP_URL &http_url)
+{
+ int result = this->in_cache (http_url.url_addr ()); // 0 means "not cached yet"; anything else skips the work below.
+ if (result == 0)
+ {
+ Auto_Destroyer <URL_Visitation_Strategy_Factory> vs (this->make_visitation_strategy_factory (http_url));
+
+ if (*vs == 0)
+ ACE_ERROR_RETURN ((LM_ERROR,
+ "%p\n",
+ "make_visitation_strategy_factory"),
+ -1);
+
+ Auto_Destroyer <URL_Iterator> ihs (vs->make_header_iterator ());
+ if (*ihs == 0)
+ ACE_ERROR_RETURN ((LM_ERROR,
+ "%p\n",
+ "make_header_iterator"),
+ -1);
+ Auto_Destroyer <URL_Processing_Strategy> phs (vs->make_header_strategy (**ihs));
+ if (*phs == 0)
+ ACE_ERROR_RETURN ((LM_ERROR,
+ "%p\n",
+ "make_header_strategy"),
+ -1);
+ int phs_result = phs->execute (); // Header pass; -1 is reported as "Invalid" below.
+ if (phs_result == -1)
+ ACE_DEBUG ((LM_DEBUG,
+ "Invalid "));
+
+ ACE_DEBUG ((LM_DEBUG,
+ "URL with status %d %s\n",
+ http_url.reply_status ().status (),
+ http_url.url_addr().addr_to_string (0)));
+
+ // Store the http url and its reply status in the cache.
+ if (this->url_cache ().bind (http_url.url_addr (),
+ http_url.reply_status ()) != 0)
+ ACE_ERROR_RETURN ((LM_ERROR,
+ "%p\n","url_cache.bind"),
+ -1);
+
+ // Since it is invalid don't go further (still returns 0).
+ if (phs_result == -1)
+ return 0;
+
+ // Get back if the recurse option isn't set.
+ if (OPTIONS::instance ()->recurse () != 1)
+ return 0;
+
+ Auto_Destroyer <URL_Iterator> is (vs->make_body_iterator ());
+ if (*is == 0)
+ ACE_ERROR_RETURN ((LM_ERROR,
+ "%p\n",
+ "make_body_iterator"),
+ -1);
+
+ Auto_Destroyer <URL_Processing_Strategy> ps (vs->make_body_strategy (**is));
+ if (*ps == 0)
+ ACE_ERROR_RETURN ((LM_ERROR,
+ "%p\n",
+ "make_body_strategy"),
+ -1);
+
+ if (ps->execute () == -1) // Body pass.
+ ACE_ERROR_RETURN ((LM_ERROR,
+ "%p\n",
+ "body execute"),
+ -1);
+
+ }
+ return 0; // Also reached when in_cache() returned non-zero.
+}
+
+int
+URL_Download_Visitation_Strategy_Factory::destroy (void)
+{
+ // Self-delete; the object must not be used after this call. Always returns 0.
+ delete this;
+ return 0;
+}
+
+URL_Iterator *
+URL_Download_Visitation_Strategy_Factory::make_header_iterator (void)
+{
+ return 0; // No header iterator is created by this factory.
+}
+
+// Create a heap-allocated download iterator for our URL; 0 on failure.
+URL_Iterator *
+URL_Download_Visitation_Strategy_Factory::make_body_iterator (void)
+{
+  URL_Iterator *body_iter = 0;
+  ACE_NEW_RETURN (body_iter,
+                  URL_Download_Iterator (*this->url_),
+                  0);
+  return body_iter;
+}
+
+URL_Processing_Strategy *
+URL_Download_Visitation_Strategy_Factory::make_header_strategy (URL_Iterator &iterator)
+{
+ // Not implemented ("you fill in here"): the iterator is ignored.
+ ACE_UNUSED_ARG (iterator);
+
+ return 0; // No header strategy is created by this factory.
+}
+
+// Create the download strategy bound to <iterator>; 0 on failure.
+URL_Processing_Strategy *
+URL_Download_Visitation_Strategy_Factory::make_body_strategy (URL_Iterator &iterator)
+{
+  URL_Processing_Strategy *strategy = 0;
+  ACE_NEW_RETURN (strategy,
+                  URL_Download_Strategy (*this->url_,
+                                         iterator),
+                  0);
+  return strategy;
+}
+
+URL_Visitation_Strategy_Factory::URL_Visitation_Strategy_Factory (URL *url)
+ : url_ (url) // Only the pointer is stored.
+{
+}
+
+URL_Visitation_Strategy_Factory::~URL_Visitation_Strategy_Factory (void) // Empty: <url_> is not deleted here.
+{
+}
+
+URL_Download_Visitation_Strategy_Factory::URL_Download_Visitation_Strategy_Factory (URL *url)
+ : URL_Visitation_Strategy_Factory (url) // The base class stores the URL pointer.
+{
+}
+
+URL_Validation_Visitation_Strategy_Factory::URL_Validation_Visitation_Strategy_Factory (URL *url,
+ URL_Validation_Visitor &visitor_context)
+ : URL_Visitation_Strategy_Factory (url), // The base class stores the URL pointer.
+ visitor_context_ (visitor_context) // Later handed to the body strategy.
+{
+}
+
+URL_Visitation_Strategy_Factory *
+URL_Download_Visitor::make_visitation_strategy_factory (URL &url)
+{
+  // Sends the request for <url> and returns a new download strategy
+  // factory, or 0 if the request keeps failing or memory is exhausted.
+
+  // See if we can get connected and send the GET request via the
+  // <HTTP_URL>.  Retry a bounded number of times instead of spinning
+  // forever when the request can never succeed.
+  const int max_attempts = 3;
+
+  for (int attempt = 1; url.send_request () == -1; ++attempt)
+    if (attempt >= max_attempts)
+      {
+        ACE_ERROR ((LM_ERROR,
+                    "%p\n",
+                    "send_request"));
+        return 0;
+      }
+
+  // @@ Here's where we could check to see if the <url> was HTTP or
+  // FTP, etc.  But for now we'll just assume that everything is an
+  // HTTP URL.
+  URL_Visitation_Strategy_Factory *vs;
+  ACE_NEW_RETURN (vs,
+                  URL_Download_Visitation_Strategy_Factory (&url),
+                  0);
+  return vs;
+}
+
+int
+URL_Download_Visitor::destroy (void)
+{
+ // Self-delete; the object must not be used after this call. Always returns 0.
+ delete this;
+ return 0;
+}
+
+int
+URL_Download_Visitor::visit (HTTP_URL &http_url)
+{
+ Auto_Destroyer <URL_Visitation_Strategy_Factory> vs (this->make_visitation_strategy_factory (http_url));
+
+ if (*vs == 0) // Factory creation failed.
+ ACE_ERROR_RETURN ((LM_ERROR,
+ "%p\n",
+ "make_visitation_strategy_factory"),
+ -1);
+
+ Auto_Destroyer <URL_Iterator> is (vs->make_body_iterator ());
+ if (*is == 0) // Iterator creation failed.
+ ACE_ERROR_RETURN ((LM_ERROR,
+ "%p\n",
+ "make_body_iterator"),
+ -1);
+
+ Auto_Destroyer <URL_Processing_Strategy> ps (vs->make_body_strategy (**is));
+ if (*ps == 0) // Strategy creation failed.
+ ACE_ERROR_RETURN ((LM_ERROR,
+ "%p\n",
+ "make_body_strategy"),
+ -1);
+
+ if (ps->execute () == -1) // Run the body strategy built above.
+ ACE_ERROR_RETURN ((LM_ERROR,
+ "%p\n",
+ "body execute"),
+ -1);
+ return 0; // Every step succeeded.
+}
diff --git a/ACE/examples/Web_Crawler/URL_Visitor.h b/ACE/examples/Web_Crawler/URL_Visitor.h
new file mode 100644
index 00000000000..9f68612d629
--- /dev/null
+++ b/ACE/examples/Web_Crawler/URL_Visitor.h
@@ -0,0 +1,436 @@
+/* -*- C++ -*- */
+// $Id$
+
+// ============================================================================
+//
+// = LIBRARY
+// examples/Web_Crawler
+//
+// = FILENAME
+// URL_Visitor.h
+//
+// = AUTHOR
+// Douglas C. Schmidt <schmidt@cs.wustl.edu>
+// Kirthika Parameswaran <kirthika@cs.wustl.edu>
+// ============================================================================
+
+#ifndef _URL_VISITOR_H
+#define _URL_VISITOR_H
+#include /**/ "ace/pre.h"
+
+#include "ace/Strategies_T.h"
+
+#if !defined (ACE_LACKS_PRAGMA_ONCE)
+#pragma once
+#endif /* ACE_LACKS_PRAGMA_ONCE */
+
+
+#include "HTTP_URL.h"
+#include "Iterators.h"
+#include "ace/Hash_Map_Manager_T.h"
+#include "ace/Caching_Strategies_T.h"
+#include "ace/Cached_Connect_Strategy_T.h"
+#include "Options.h"
+#include "ace/Pair_T.h"
+
+// Forward declarations.
+class URL_Validation_Visitor;
+
+class URL_Processing_Strategy
+{
+ // = TITLE
+ // Abstract base class for the URL processing strategy.
+ //
+ // = DESCRIPTION
+ // Holds references to the URL being processed and to the iterator
+ // over it. Subclasses supply <execute>. Both are stored by
+ // reference, so they must outlive the strategy.
+public:
+ URL_Processing_Strategy (URL &,
+ URL_Iterator &);
+ // Constructor.
+
+ virtual ~URL_Processing_Strategy (void);
+ // Virtual destructor, so subclasses can be deleted through a base
+ // pointer.
+
+ virtual int execute (void) = 0;
+ // Perform the strategy. Callers in this example treat a return
+ // value of -1 as failure.
+
+ virtual int destroy (void);
+
+ // <destroy>: close down the resources.
+
+protected:
+ URL &url_;
+ // A reference to the URL "context" that we're processing.
+
+ URL_Iterator &iterator_;
+ // Iterator for the URL that we're processing.
+};
+
+class HTTP_Header_Processing_Strategy : public URL_Processing_Strategy
+{
+ // = TITLE
+ // Defines the HTTP header processing strategy.
+ //
+ // = DESCRIPTION
+ // Concrete <URL_Processing_Strategy> whose <execute> handles the
+ // HTTP header.
+public:
+ HTTP_Header_Processing_Strategy (URL &,
+ URL_Iterator &);
+ // Constructor. Takes the same URL and iterator arguments as the
+ // base class constructor.
+
+ virtual int execute (void);
+ // Perform the strategy for processing an HTTP header.
+};
+
+class HTML_Body_Validation_Strategy : public URL_Processing_Strategy
+{
+ // = TITLE
+ // Defines the HTML body processing strategy.
+ //
+ // = DESCRIPTION
+ // This class iterates through the body of an HTML file and
+ // recursively visits embedded links.
+public:
+ HTML_Body_Validation_Strategy (URL &,
+ URL_Iterator &,
+ URL_Validation_Visitor &);
+ // Constructor. In addition to the URL and its iterator, takes the
+ // <URL_Validation_Visitor> that serves as the visit context.
+
+ virtual int execute (void);
+ // Perform the strategy for processing an HTML file. This strategy
+ // iterates over the HTML file and recursively visits embedded links
+ // to process them, as well.
+
+private:
+ URL_Validation_Visitor &visitor_context_;
+ // This is the context of the visit. Held by reference, so the
+ // visitor must outlive this strategy.
+};
+
+class URL_Download_Strategy : public URL_Processing_Strategy
+{
+ // = TITLE
+ // Defines a URL downloading strategy.
+ //
+ // = DESCRIPTION
+ // This class downloads a URL's contents into a temporary file.
+public:
+ URL_Download_Strategy (URL &,
+ URL_Iterator &);
+ // Constructor. Takes the same URL and iterator arguments as the
+ // base class constructor.
+
+ virtual int execute (void);
+ // Perform the strategy for downloading a URL to a temporary file.
+};
+
+class URL_Visitation_Strategy_Factory
+{
+ // = TITLE
+ // Abstract Factory for the URL visitation strategy.
+ //
+ // = DESCRIPTION
+ // Declares Factory Methods for the header/body iterators and the
+ // header/body processing strategies of one URL. Every factory
+ // method and <destroy> is pure virtual, so only subclasses can be
+ // instantiated.
+public:
+ URL_Visitation_Strategy_Factory (URL *);
+ // Constructor. Takes the URL that the products are made for.
+
+ /// Destructor.
+ virtual ~URL_Visitation_Strategy_Factory (void);
+
+ // = Factory Methods.
+ virtual URL_Iterator *make_header_iterator (void) = 0;
+ // Factory Method that makes the header iterator.
+
+ virtual URL_Iterator *make_body_iterator (void) = 0;
+ // Factory Method that makes the body iterator.
+
+ virtual URL_Processing_Strategy *make_header_strategy (URL_Iterator &) = 0;
+ // Factory Method that makes the header processing strategy.
+
+ virtual URL_Processing_Strategy *make_body_strategy (URL_Iterator &) = 0;
+ // Factory Method that makes the body processing strategy .
+
+ virtual int destroy (void) = 0;
+ // Close down the resources.
+
+protected:
+ URL *url_;
+ // Stash the URL so we don't have to pass it around. The base class
+ // destructor does not delete it.
+};
+
+class URL_Download_Visitation_Strategy_Factory : public URL_Visitation_Strategy_Factory
+{
+ // = TITLE
+ // Concrete Factory for the URL download visitation strategy.
+ //
+ // = DESCRIPTION
+ // Implements every Factory Method of
+ // <URL_Visitation_Strategy_Factory> for the download case.
+public:
+ URL_Download_Visitation_Strategy_Factory (URL *);
+ // Constructor. Forwards the URL to the base class.
+
+ // = Factory Methods.
+ virtual URL_Iterator *make_header_iterator (void);
+ // Factory Method that makes an <HTTP_Header_Iterator>.
+
+ virtual URL_Iterator *make_body_iterator (void);
+ // Factory Method that makes an <HTML_Body_Iterator>.
+
+ virtual URL_Processing_Strategy *make_header_strategy (URL_Iterator &);
+ // Factory Method that makes the header processing strategy.
+
+ virtual URL_Processing_Strategy *make_body_strategy (URL_Iterator &);
+ // Factory Method that makes the body processing strategy .
+
+ virtual int destroy (void);
+ // Close down the resources.
+};
+
+class URL_Validation_Visitation_Strategy_Factory : public URL_Visitation_Strategy_Factory
+{
+ // = TITLE
+ // Concrete Factory for the URL validation visitation strategy.
+ //
+ // = DESCRIPTION
+ // Implements every Factory Method of
+ // <URL_Visitation_Strategy_Factory> for the validation case and
+ // keeps a reference to the <URL_Validation_Visitor> context.
+public:
+ URL_Validation_Visitation_Strategy_Factory (URL *,
+ URL_Validation_Visitor &);
+ // Constructor. Forwards the URL to the base class and stores the
+ // visitor context by reference.
+
+ // = Factory Methods.
+ virtual URL_Iterator *make_header_iterator (void);
+ // Factory Method that makes an <HTTP_Header_Iterator>.
+
+ virtual URL_Iterator *make_body_iterator (void);
+ // Factory Method that makes an <HTML_Body_Iterator>.
+
+ virtual URL_Processing_Strategy *make_header_strategy (URL_Iterator &);
+ // Factory Method that makes the header processing strategy.
+
+ virtual URL_Processing_Strategy *make_body_strategy (URL_Iterator &);
+ // Factory Method that makes the body processing strategy .
+
+ virtual int destroy (void);
+ // Close down the resources.
+
+private:
+ URL_Validation_Visitor &visitor_context_;
+ // Context of the visitor. Held by reference, so the visitor must
+ // outlive this factory.
+};
+
+class URL_Visitor
+{
+ // = TITLE
+ // Base class for the URL Visitor.
+ //
+ // = DESCRIPTION
+ // This class plays the "visitor" role in the Visitor pattern.
+ // <visit>, <destroy> and <make_visitation_strategy_factory> are
+ // pure virtual and must be supplied by subclasses.
+public:
+
+ virtual ~URL_Visitor (void);
+ // Virtual destructor.
+
+ virtual int visit (HTTP_URL &http_url) = 0;
+ // Visit an <HTTP_URL>.
+
+ // @@
+ // virtual int visit (FTP_URL &http_url) = 0;
+
+ virtual int destroy (void) = 0;
+ // Cleanup the resources.
+
+protected:
+ virtual URL_Visitation_Strategy_Factory *make_visitation_strategy_factory (URL &) = 0;
+ // Make the appropriate <URL_Visitation_Strategy_Factory>.
+};
+
+// = Type aliases for the connection-caching machinery.
+// Several of these (<NULL_CREATION_STRATEGY>,
+// <NULL_ACTIVATION_STRATEGY>, <CACHED_CONNECT_STRATEGY>,
+// <STRATEGY_CONNECTOR> and <CACHING_STRATEGY>) are the types of data
+// members of <URL_Validation_Visitor> below.
+
+// Cache entries pair a service handler with an <int> attribute and are
+// keyed by a ref-counted, hashable <ACE_INET_Addr>.
+typedef int ATTRIBUTES;
+typedef ACE_Svc_Handler <ACE_SOCK_STREAM, ACE_NULL_SYNCH>
+ Client_Svc_Handler;
+typedef ACE_Pair<Client_Svc_Handler *, ATTRIBUTES>
+ CACHED_HANDLER;
+typedef ACE_Refcounted_Hash_Recyclable<ACE_INET_Addr>
+ ACE_ADDR;
+typedef ACE_Hash<ACE_ADDR> H_KEY;
+typedef ACE_Equal_To<ACE_ADDR> C_KEYS;
+
+// Hash map from address to cached handler, plus its iterators. All
+// use <ACE_Null_Mutex> as the lock type.
+typedef ACE_Hash_Map_Manager_Ex<ACE_ADDR, CACHED_HANDLER, H_KEY, C_KEYS, ACE_Null_Mutex>
+ HASH_MAP;
+typedef ACE_Hash_Map_Iterator_Ex<ACE_ADDR, CACHED_HANDLER, H_KEY, C_KEYS, ACE_Null_Mutex>
+ HASH_MAP_ITERATOR;
+typedef ACE_Hash_Map_Reverse_Iterator_Ex<ACE_ADDR, CACHED_HANDLER, H_KEY, C_KEYS, ACE_Null_Mutex>
+ HASH_MAP_REVERSE_ITERATOR;
+
+typedef ACE_Recyclable_Handler_Cleanup_Strategy<ACE_ADDR, CACHED_HANDLER, HASH_MAP>
+ CLEANUP_STRATEGY;
+typedef ACE_Recyclable_Handler_Caching_Utility<ACE_ADDR, CACHED_HANDLER, HASH_MAP, HASH_MAP_ITERATOR, ATTRIBUTES>
+ CACHING_UTILITY;
+
+typedef ACE_LRU_Caching_Strategy<ATTRIBUTES, CACHING_UTILITY>
+ LRU_CACHING_STRATEGY;
+
+// The caching strategy actually used is the LRU one.
+typedef LRU_CACHING_STRATEGY
+ CACHING_STRATEGY;
+
+typedef ACE_Strategy_Connector<Client_Svc_Handler, ACE_SOCK_CONNECTOR>
+ STRATEGY_CONNECTOR;
+
+typedef ACE_NOOP_Creation_Strategy<Client_Svc_Handler>
+ NULL_CREATION_STRATEGY;
+
+typedef ACE_NOOP_Concurrency_Strategy<Client_Svc_Handler>
+ NULL_ACTIVATION_STRATEGY;
+
+typedef ACE_Cached_Connect_Strategy_Ex<Client_Svc_Handler, ACE_SOCK_CONNECTOR, CACHING_STRATEGY, ATTRIBUTES, ACE_SYNCH_NULL_MUTEX>
+ CACHED_CONNECT_STRATEGY;
+
+class URL_Validation_Visitor : public URL_Visitor
+{
+ // = TITLE
+ // Subclass that defines the URL validation visitor.
+ //
+ // = DESCRIPTION
+ // This class checks to make sure that the <HTTP_URL> is valid.
+ // If the <HTTP_URL> is an <HTML> file, it can also be used to
+ // recursively check that all embedded links in this file are
+ // valid.
+public:
+ typedef ACE_Hash_Map_Manager <ACE_URL_Addr, URL_Status, ACE_Null_Mutex>
+ URL_CACHE;
+ // Maps a URL address to the status recorded for it.
+
+ virtual int visit (HTTP_URL &http_url);
+ // Visit an <HTTP_URL> to make sure that it's valid. If the content
+ // type of the <HTTP_URL> is "text/html" and the <recursion> option
+ // is enabled then <visit> recursively checks each link embedded in
+ // the HTML page.
+
+ // @@
+ // virtual int visit (FTP_URL &http_url);
+
+ URL_Validation_Visitor (void);
+ // Default constructor.
+
+ virtual int destroy (void);
+ // Cleanup the resources.
+
+ URL_CACHE &url_cache (void);
+ // Returns a reference to the URL cache.
+
+
+ // The block below is an earlier, disabled set of connection-cache
+ // typedefs. The live equivalents are declared at file scope above
+ // this class.
+ /*
+
+
+ typedef ACE_Svc_Handler<ACE_SOCK_STREAM, ACE_NULL_SYNCH>
+ Svc_Handler;
+ typedef ACE_Strategy_Connector<Svc_Handler, ACE_SOCK_CONNECTOR>
+ STRAT_CONNECTOR;
+ typedef ACE_Refcounted_Hash_Recyclable<ACE_INET_Addr>
+ REFCOUNTED_HASH_RECYCLABLE_ADDRESS;
+ typedef ACE_NOOP_Creation_Strategy<Svc_Handler>
+ NULL_CREATION_STRATEGY;
+ typedef ACE_NOOP_Concurrency_Strategy<Svc_Handler>
+ NULL_ACTIVATION_STRATEGY;
+
+ typedef ACE_Hash_Map_Manager_Ex<REFCOUNTED_HASH_RECYCLABLE_ADDRESS,\
+ ACE_Pair<Svc_Handler *, int>,\
+ ACE_Hash<REFCOUNTED_HASH_RECYCLABLE_ADDRESS>, \
+ ACE_Equal_To<REFCOUNTED_HASH_RECYCLABLE_ADDRESS>,\
+ ACE_Null_Mutex>
+ CONNECTION_HASH_MAP;
+ typedef ACE_Hash_Map_Iterator_Ex<REFCOUNTED_HASH_RECYCLABLE_ADDRESS,\
+ ACE_Pair<Svc_Handler *, int>,\
+ ACE_Hash<REFCOUNTED_HASH_RECYCLABLE_ADDRESS>, \
+ ACE_Equal_To<REFCOUNTED_HASH_RECYCLABLE_ADDRESS>,\
+ ACE_Null_Mutex>
+ CONNECTION_HASH_MAP_ITERATOR;
+ typedef ACE_Hash_Map_Reverse_Iterator_Ex<REFCOUNTED_HASH_RECYCLABLE_ADDRESS,\
+ ACE_Pair<Svc_Handler *, int>,\
+ ACE_Hash<REFCOUNTED_HASH_RECYCLABLE_ADDRESS>, \
+ ACE_Equal_To<REFCOUNTED_HASH_RECYCLABLE_ADDRESS>,\
+ ACE_Null_Mutex>
+ CONNECTION_HASH_MAP_REVERSE_ITERATOR;
+ typedef ACE_Pair_Caching_Utility <REFCOUNTED_HASH_RECYCLABLE_ADDRESS, \
+ ACE_Pair<Svc_Handler *, int>, \
+ CONNECTION_HASH_MAP, CONNECTION_HASH_MAP_ITERATOR, int >
+ CACHING_STRATEGY_UTILITY;
+ typedef ACE_LRU_Caching_Strategy<REFCOUNTED_HASH_RECYCLABLE_ADDRESS,\
+ ACE_Pair<Svc_Handler *, int>,\
+ CONNECTION_HASH_MAP, int,\
+ CACHING_STRATEGY_UTILITY >
+ LRU;
+ typedef ACE_Cached_Connect_Strategy_Ex<Svc_Handler,ACE_SOCK_CONNECTOR, LRU,int, ACE_SYNCH_NULL_MUTEX>
+ CACHED_CONNECT_STRATEGY;
+ */
+protected:
+ virtual ~URL_Validation_Visitor (void);
+ // Protected destructor, so code outside the class hierarchy cannot
+ // <delete> a validation visitor directly. Presumably instances are
+ // meant to be released through <destroy> -- confirm in the .cpp.
+
+ virtual URL_Visitation_Strategy_Factory *make_visitation_strategy_factory (URL &);
+ // Factory Method that makes a
+ // <URL_Validation_Visitation_Strategy_Factory>.
+
+ URL_CACHE url_cache_;
+ // Cache the status of URLs we've already validated.
+
+ int in_cache (const ACE_URL_Addr &url_addr);
+ // Check to see if the reply status of this <url_addr> is in the
+ // cache. Returns 1 if so, 0 if not.
+
+ NULL_CREATION_STRATEGY creation_strategy_;
+ NULL_ACTIVATION_STRATEGY activation_strategy_;
+
+ // Configure the Strategy Connector with a strategy that caches
+ // connection.
+ // NOTE(review): the two members below are raw pointers; who
+ // allocates and frees them is not visible in this header.
+ CACHED_CONNECT_STRATEGY *caching_connect_strategy_;
+
+ STRATEGY_CONNECTOR *strat_connector_;
+
+ CACHING_STRATEGY caching_strategy_;
+};
+
+
+class URL_Download_Visitor : public URL_Visitor
+{
+ // = TITLE
+ // Subclass for the URL download visitor.
+ //
+ // = DESCRIPTION
+ // This class visits an <HTTP_URL> by running the body processing
+ // strategy produced by its visitation strategy factory.
+ // Presumably that strategy is the <URL_Download_Strategy>, which
+ // downloads the URL's contents -- confirm in the factory.
+public:
+ virtual int visit (HTTP_URL &http_url);
+ // Visit an <HTTP_URL>: makes the visitation strategy factory, the
+ // body iterator and the body processing strategy, then executes
+ // that strategy. Returns 0 on success and -1 if any step fails.
+
+ // @@
+ // virtual int visit (FTP_URL &http_url);
+
+ virtual int destroy (void);
+ // Cleanup the resources. Deletes this object.
+
+protected:
+ URL_Visitation_Strategy_Factory *make_visitation_strategy_factory (URL &);
+ // Factory Method that sends the request via the URL and then makes
+ // a <URL_Download_Visitation_Strategy_Factory>. Returns 0 on
+ // failure.
+};
+
+template <class T>
+class Auto_Destroyer
+{
+  // = TITLE
+  //     Simple class that ensures the <destroy> method is called on our
+  //     <URL_*> objects when they go out of scope.
+  //
+  // = DESCRIPTION
+  //     This class is similar to an auto_ptr<> and should be used to
+  //     simplify blocks of code that must create/destroy pointers to
+  //     various <URL_*> related strategies and iterators.  It is the
+  //     sole owner of the pointer it holds and therefore cannot be
+  //     copied.
+public:
+  Auto_Destroyer (T *t): t_ (t) {}
+  // Take ownership of <t>, which may be 0.
+
+  T *operator-> (void) const { return this->t_; }
+  // Member access on the held object.
+
+  T *operator *(void) const { return this->t_; }
+  // Returns the held *pointer* rather than a reference, so callers
+  // compare <*ptr> against 0 and write <**ptr> to reach the object.
+
+  void operator= (T *t)
+  {
+    // Re-seating with the pointer we already hold must be a no-op;
+    // destroying it here would leave <t_> dangling and make the
+    // destructor destroy it a second time.
+    if (this->t_ == t)
+      return;
+
+    if (this->t_ != 0)
+      this->t_->destroy ();
+    this->t_ = t;
+  }
+  // Destroy the currently held object, if any, and take ownership of
+  // <t> instead.
+
+  ~Auto_Destroyer (void)
+  {
+    if (this->t_ != 0)
+      this->t_->destroy ();
+  }
+  // Calls <destroy> on the held object unless the pointer is 0.
+
+private:
+  // = Copying is disallowed (declared but never defined): two owners
+  //   of one pointer would each call <destroy> on it.
+  Auto_Destroyer (const Auto_Destroyer<T> &);
+  void operator= (const Auto_Destroyer<T> &);
+
+  T *t_;
+  // The owned pointer; 0 when nothing is held.
+};
+
+#include /**/ "ace/post.h"
+#endif /* _URL_VISITOR_H */
diff --git a/ACE/examples/Web_Crawler/URL_Visitor_Factory.cpp b/ACE/examples/Web_Crawler/URL_Visitor_Factory.cpp
new file mode 100644
index 00000000000..1b8a316b219
--- /dev/null
+++ b/ACE/examples/Web_Crawler/URL_Visitor_Factory.cpp
@@ -0,0 +1,53 @@
+/* -*- C++ -*- */
+// $Id$
+
+#include "URL_Visitor_Factory.h"
+
+ACE_RCSID (Web_Crawler,
+ URL_Visitor_Factory,
+ "$Id$")
+
+
+// Out-of-line definition of the virtual destructor; there is nothing
+// to release here.
+URL_Visitor_Factory::~URL_Visitor_Factory (void)
+{
+}
+
+URL_Visitor *
+URL_Validation_Visitor_Factory::make_visitor (void)
+{
+  // Factory Method: heap-allocate a <URL_Validation_Visitor>.  The
+  // macro returns 0 from this method if the allocation fails.
+  URL_Visitor *visitor = 0;
+  ACE_NEW_RETURN (visitor, URL_Validation_Visitor, 0);
+  return visitor;
+}
+
+Command_Processor *
+URL_Validation_Visitor_Factory::make_command_processor (void)
+{
+  // Factory Method: heap-allocate a <Command_Processor>.  The macro
+  // returns 0 from this method if the allocation fails.
+  Command_Processor *processor = 0;
+  ACE_NEW_RETURN (processor, Command_Processor, 0);
+  return processor;
+}
+
+URL_Visitor *
+URL_Download_Visitor_Factory::make_visitor (void)
+{
+  // Factory Method: heap-allocate a <URL_Download_Visitor>.  The
+  // macro returns 0 from this method if the allocation fails.
+  URL_Visitor *visitor = 0;
+  ACE_NEW_RETURN (visitor, URL_Download_Visitor, 0);
+  return visitor;
+}
+
+Command_Processor *
+URL_Download_Visitor_Factory::make_command_processor (void)
+{
+  // Factory Method: heap-allocate a <Command_Processor>, exactly as
+  // the validation factory does.  Unconditionally returning 0 here
+  // made <Web_Crawler::run> fail with "make_command_processor"
+  // whenever this factory was selected.  Returns 0 only if the
+  // allocation fails.
+  Command_Processor *cp = 0;
+  ACE_NEW_RETURN (cp,
+                  Command_Processor,
+                  0);
+  return cp;
+}
diff --git a/ACE/examples/Web_Crawler/URL_Visitor_Factory.h b/ACE/examples/Web_Crawler/URL_Visitor_Factory.h
new file mode 100644
index 00000000000..9f484afe9f0
--- /dev/null
+++ b/ACE/examples/Web_Crawler/URL_Visitor_Factory.h
@@ -0,0 +1,74 @@
+/* -*- C++ -*- */
+// $Id$
+
+// ============================================================================
+//
+// = LIBRARY
+// examples/Web_Crawler
+//
+// = FILENAME
+// URL_Visitor_Factory.h
+//
+// = AUTHOR
+// Douglas C. Schmidt <schmidt@cs.wustl.edu>
+//
+// ============================================================================
+
+#ifndef _URL_VISITOR_FACTORY_H
+#define _URL_VISITOR_FACTORY_H
+
+#include "URL_Visitor.h"
+#include "Command_Processor.h"
+
+#if !defined (ACE_LACKS_PRAGMA_ONCE)
+#pragma once
+#endif /* ACE_LACKS_PRAGMA_ONCE */
+
+class URL_Visitor_Factory
+{
+ // = TITLE
+ // Abstract base class that creates URL visitors.
+ //
+ // = DESCRIPTION
+ // Subclasses define each of the Factory Methods to
+ // make the right objects, which all "vary" together.
+public:
+
+ /// Destructor.
+ virtual ~URL_Visitor_Factory (void);
+
+ virtual URL_Visitor *make_visitor (void) = 0;
+ // Factory Method that makes the appropriate type of <URL_Visitor>.
+ // Callers must check the result for 0.
+
+ virtual Command_Processor *make_command_processor (void) = 0;
+ // Factory Method that makes the appropriate type of
+ // <Command_Processor>. Callers must check the result for 0.
+};
+
+class URL_Validation_Visitor_Factory : public URL_Visitor_Factory
+{
+ // = TITLE
+ // Create a URL visitor that validates URL links.
+public:
+ virtual URL_Visitor *make_visitor (void);
+ // Factory Method that makes a <URL_Validation_Visitor>. Returns 0
+ // if the allocation fails.
+
+ virtual Command_Processor *make_command_processor (void);
+ // Factory Method that makes a <Command_Processor>. Returns 0 if
+ // the allocation fails.
+
+
+};
+
+class URL_Download_Visitor_Factory : public URL_Visitor_Factory
+{
+ // = TITLE
+ // Create a URL visitor that downloads URL links.
+public:
+ virtual URL_Visitor *make_visitor (void);
+ // Factory Method that makes a <URL_Download_Visitor>. Returns 0 if
+ // the allocation fails.
+
+ virtual Command_Processor *make_command_processor (void);
+ // Factory Method for the <Command_Processor> used with download
+ // visitors. Callers must check the result for 0.
+};
+
+#endif /* _URL_VISITOR_FACTORY_H */
diff --git a/ACE/examples/Web_Crawler/Web_Crawler.cpp b/ACE/examples/Web_Crawler/Web_Crawler.cpp
new file mode 100644
index 00000000000..16639a38d73
--- /dev/null
+++ b/ACE/examples/Web_Crawler/Web_Crawler.cpp
@@ -0,0 +1,95 @@
+// $Id$
+
+#include "Options.h"
+#include "URL_Visitor_Factory.h"
+#include "Web_Crawler.h"
+
+ACE_RCSID(Web_Crawler, Web_Crawler, "$Id$")
+
+// Destructor. Releases the factory made by <open>; deleting a 0
+// pointer is harmless when <open> was never called or failed.
+Web_Crawler::~Web_Crawler (void)
+{
+ delete this->url_visitor_factory_;
+}
+
+// Constructor. Starts without a factory; <open> makes it.
+Web_Crawler::Web_Crawler (void)
+ : url_visitor_factory_ (0)
+{
+}
+
+int
+Web_Crawler::open (int argc, ACE_TCHAR *argv[])
+{
+  // Parse the command line into the <OPTIONS> singleton and then make
+  // the visitor factory.  Returns 0 on success, -1 if the arguments
+  // cannot be parsed or the factory cannot be allocated.
+  if (-1 == OPTIONS::instance ()->parse_args (argc, argv))
+    return -1;
+
+  // @@ Put the ACE_Service_Config::open() stuff here somewhere...
+
+  // For now just hardcode this to create "validation" visitors.
+  ACE_NEW_RETURN (this->url_visitor_factory_, URL_Validation_Visitor_Factory, -1);
+  return 0;
+}
+
+int
+Web_Crawler::run (void)
+{
+  // Run the crawl: make the visitor and command processor, publish
+  // them through <OPTIONS>, then queue and execute a command for the
+  // URL given on the command line.  Returns 0 on success and -1 on
+  // any failure, including memory exhaustion.
+
+  // <open> is what makes the factory; without it there is nothing to
+  // run, and dereferencing the 0 pointer would crash.
+  if (this->url_visitor_factory_ == 0)
+    ACE_ERROR_RETURN ((LM_ERROR,
+                       "Web_Crawler::run called before a successful open\n"),
+                      -1);
+
+  // Make the appropriate <URL_Visitor>.
+  Auto_Destroyer<URL_Visitor> visitor (this->url_visitor_factory_->make_visitor ());
+
+  if (*visitor == 0)
+    ACE_ERROR_RETURN ((LM_ERROR,
+                       "%p\n",
+                       "make_visitor"),
+                      -1);
+
+  // Make the appropriate <Command_Processor>.
+  Auto_Destroyer<Command_Processor> cp (this->url_visitor_factory_->make_command_processor ());
+
+  if (*cp == 0)
+    ACE_ERROR_RETURN ((LM_ERROR,
+                       "%p\n",
+                       "make_command_processor"),
+                      -1);
+
+  // NOTE(review): <cp> and <visitor> are destroyed when this method
+  // returns, yet the pointers handed to <OPTIONS> below stay behind --
+  // confirm nothing reads them afterwards.
+
+  // Set the <Command_Processor> in the <Options> to make it visible.
+  OPTIONS::instance ()->command_processor (*cp);
+
+  // Set the <URL_Visitor> in the <Options> to make it visible.
+  OPTIONS::instance ()->visitor (*visitor);
+
+  // @@ You fill in here...
+  // Allocation failures must report -1; returning 0 here used to
+  // signal success to the caller.
+  ACE_URL_Addr *url_addr = 0;
+  ACE_NEW_RETURN (url_addr,
+                  ACE_URL_Addr (OPTIONS::instance()->hostname (),
+                                OPTIONS::instance()->path_name (),
+                                OPTIONS::instance()->port_no ()),
+                  -1);
+  Auto_Destroyer<ACE_URL_Addr> url_addr_ptr (url_addr);
+
+  HTTP_URL *http_url = 0;
+  ACE_NEW_RETURN (http_url,
+                  HTTP_URL (**url_addr_ptr),
+                  -1);
+
+  Auto_Destroyer<HTTP_URL> http_url_ptr (http_url);
+
+  URL_Command *url_command = 0;
+  ACE_NEW_RETURN (url_command,
+                  URL_Command (*http_url_ptr),
+                  -1);
+  // Auto_Destroyer<URL_Command> url_command_ptr (url_command);
+  // NOTE(review): <url_command> is presumably owned by the
+  // <Command_Processor> once inserted; if <insert> fails it may leak
+  // -- confirm against Command_Processor.
+
+  if (cp->insert (url_command) != 0)
+    ACE_ERROR_RETURN ((LM_ERROR,
+                       "%p\n", "insert"),
+                      -1);
+
+  if (cp->execute () != 0)
+    ACE_ERROR_RETURN ((LM_ERROR,
+                       "%p\n", "execute"),
+                      -1);
+  return 0;
+}
+
diff --git a/ACE/examples/Web_Crawler/Web_Crawler.h b/ACE/examples/Web_Crawler/Web_Crawler.h
new file mode 100644
index 00000000000..01e275e2187
--- /dev/null
+++ b/ACE/examples/Web_Crawler/Web_Crawler.h
@@ -0,0 +1,62 @@
+/* -*- C++ -*- */
+// $Id$
+
+// ============================================================================
+//
+// = LIBRARY
+// examples/Web_Crawler
+//
+// = FILENAME
+// Web_Crawler.h
+//
+// = AUTHOR
+// Douglas C. Schmidt <schmidt@cs.wustl.edu>
+//
+// ============================================================================
+
+#ifndef _WEB_CRAWLER_H
+#define _WEB_CRAWLER_H
+
+#include "URL_Addr.h"
+#include "HTTP_URL.h"
+
+#if !defined (ACE_LACKS_PRAGMA_ONCE)
+#pragma once
+#endif /* ACE_LACKS_PRAGMA_ONCE */
+
+// Forward declaration.
+class URL_Visitor_Factory;
+
+class Web_Crawler
+{
+  // = TITLE
+  //     An abstraction for a Web Crawler.
+  //
+  // = DESCRIPTION
+  //     This class is a Facade that organizes the other classes in the
+  //     solution, which include a factory that creates a visitor,
+  //     which in turn embodies the appropriate visitation strategy.
+  //     It owns its <URL_Visitor_Factory> and therefore cannot be
+  //     copied.
+public:
+  // = Initialization and termination methods.
+  Web_Crawler (void);
+  // Constructor.
+
+  ~Web_Crawler (void);
+  // Destructor.  Deletes the <URL_Visitor_Factory>.
+
+  int open (int argc, ACE_TCHAR *argv[]);
+  // Parses the command-line options and initializes the
+  // <URL_Visitor_Factory>.  Returns -1 on failure and 0 on success.
+
+  int run (void);
+  // Run the Web Crawler and carries out whatever visitation strategy
+  // is configured.  Returns -1 on failure and 0 on success.
+
+private:
+  // = Copying is disallowed (declared but never defined): a copy
+  //   would share <url_visitor_factory_> and both destructors would
+  //   delete it.
+  Web_Crawler (const Web_Crawler &);
+  void operator= (const Web_Crawler &);
+
+  URL_Visitor_Factory *url_visitor_factory_;
+  // Pointer to a factory that creates visitors that explore URLs and
+  // perform various tasks.  Subclasses of <URL_Visitor_Factory>
+  // determine what happens during a visitation.
+};
+
+#endif /* _WEB_CRAWLER_H */
diff --git a/ACE/examples/Web_Crawler/Web_Crawler.mpc b/ACE/examples/Web_Crawler/Web_Crawler.mpc
new file mode 100644
index 00000000000..7750d7cbd5d
--- /dev/null
+++ b/ACE/examples/Web_Crawler/Web_Crawler.mpc
@@ -0,0 +1,7 @@
+// -*- MPC -*-
+// $Id$
+
+// NOTE(review): <avoids += ace_for_tao> presumably skips this example
+// in builds configured with the ace_for_tao feature, and
+// <exename = main> presumably names the resulting executable "main"
+// -- confirm against the MPC documentation.
+project : aceexe {
+ avoids += ace_for_tao
+ exename = main
+}
diff --git a/ACE/examples/Web_Crawler/main.cpp b/ACE/examples/Web_Crawler/main.cpp
new file mode 100644
index 00000000000..1735f811b78
--- /dev/null
+++ b/ACE/examples/Web_Crawler/main.cpp
@@ -0,0 +1,51 @@
+// $Id$
+
+// ============================================================================
+//
+// = LIBRARY
+// examples/Web_Crawler
+//
+// = FILENAME
+// main.cpp
+//
+// = DESCRIPTION
+// This program implements a Web crawler that can be configured to
+// apply various strategies to URLs that it visits.
+//
+// = AUTHOR
+// Doug Schmidt <schmidt@cs.wustl.edu>
+//
+// ============================================================================
+
+#include "ace/OS_main.h"
+#include "ace/Signal.h"
+#include "Web_Crawler.h"
+#include "Options.h"
+
+ACE_RCSID(Web_Crawler, main, "$Id$")
+
+// Signal handler that logs "aborting!" and then aborts the process.
+// The signal number argument is ignored. <ACE_TMAIN> installs this
+// handler for SIGFPE unless ACE_HAS_WINCE is defined.
+// NOTE(review): logging from inside a signal handler may not be
+// async-signal-safe -- confirm this is acceptable for an example.
+void sig_handler (int)
+{
+ ACE_DEBUG ((LM_DEBUG,
+ ACE_TEXT ("aborting!\n")));
+ ACE_OS::abort ();
+}
+
+int
+ACE_TMAIN (int argc, ACE_TCHAR *argv[])
+{
+  // Program entry point.  Installs the SIGFPE handler (skipped when
+  // ACE_HAS_WINCE is defined), then opens and runs the crawler.  The
+  // exit status is 0 on success and 1 if either step reports -1.
+#if !defined (ACE_HAS_WINCE)
+  ACE_Sig_Action sa ((ACE_SignalHandler) sig_handler, SIGFPE);
+#endif /* !ACE_HAS_WINCE */
+  Web_Crawler crawler;
+
+  // Short-circuit evaluation keeps <run> from being called when
+  // <open> has already failed.
+  const bool failed =
+    crawler.open (argc, argv) == -1 || crawler.run () == -1;
+
+  return failed ? 1 : 0;
+}
+
+
+