diff options
author | William R. Otte <wotte@dre.vanderbilt.edu> | 2006-07-24 15:50:30 +0000 |
---|---|---|
committer | William R. Otte <wotte@dre.vanderbilt.edu> | 2006-07-24 15:50:30 +0000 |
commit | c44379cc7d9c7aa113989237ab0f56db12aa5219 (patch) | |
tree | 66a84b20d47f2269d8bdc6e0323f338763424d3a /ACE/examples/Web_Crawler | |
parent | 3aff90f4a822fcf5d902bbfbcc9fa931d6191a8c (diff) | |
download | ATCD-c44379cc7d9c7aa113989237ab0f56db12aa5219.tar.gz |
Repo restructuring
Diffstat (limited to 'ACE/examples/Web_Crawler')
27 files changed, 3362 insertions, 0 deletions
diff --git a/ACE/examples/Web_Crawler/.cvsignore b/ACE/examples/Web_Crawler/.cvsignore
new file mode 100644
index 00000000000..ba2906d0666
--- /dev/null
+++ b/ACE/examples/Web_Crawler/.cvsignore
@@ -0,0 +1 @@
+main
diff --git a/ACE/examples/Web_Crawler/Command_Processor.cpp b/ACE/examples/Web_Crawler/Command_Processor.cpp
new file mode 100644
index 00000000000..83289095444
--- /dev/null
+++ b/ACE/examples/Web_Crawler/Command_Processor.cpp
@@ -0,0 +1,128 @@
+// $Id$
+
+#include "ace/OS_NS_string.h"
+#include "URL.h"
+#include "HTTP_URL.h"
+#include "Options.h"
+#include "Command_Processor.h"
+#include "URL_Visitor.h"
+
+ACE_RCSID(Web_Crawler, Command_Processor, "$Id$")
+
+Command::~Command (void)
+{
+}
+
+URL_Command::URL_Command (URL *url)
+  : url_ (url)
+{
+}
+
+// Visit the URL held by this command with the visitor registered in
+// <OPTIONS>.  Returns 0 if the URL was skipped or visited
+// successfully, -1 if the visitor rejected it.
+int
+URL_Command::execute (void)
+{
+  // A URL whose path contains any of these substrings is skipped:
+  // <execute> returns 0 without calling the visitor.
+  static const char *const skip_patterns[] =
+    {
+      "news:", ".cgi", "mailto", ".gif", ".pdf", ".map", ".bmp", ".jpg"
+    };
+
+  ACE_CString check_string
+    (ACE_TEXT_ALWAYS_CHAR (this->url_->url_addr ().get_path_name ()));
+
+  for (size_t i = 0;
+       i < sizeof skip_patterns / sizeof skip_patterns[0];
+       ++i)
+    if (check_string.find (skip_patterns[i]) != ACE_CString::npos)
+      return 0;
+
+  if (this->url_->accept (OPTIONS::instance ()->visitor ()) != 0)
+    {
+      ACE_DEBUG ((LM_DEBUG,
+                  "Coudnt accept url\n"));
+      return -1;
+    }
+  return 0;
+}
+
+// Delete this command.  The object must not be used afterwards.
+int
+URL_Command::destroy (void)
+{
+  delete this;
+  return 0;
+}
+
+Command_Processor::Command_Processor (void)
+{
+}
+
+Command_Processor::~Command_Processor (void)
+{
+}
+
+// Delete this processor.  The destructor is private, so this is the
+// only way to dispose of a <Command_Processor>.
+int
+Command_Processor::destroy (void)
+{
+  delete this;
+  return 0;
+}
+
+// Dequeue and execute commands until the queue is empty.  Returns 0
+// once the queue has been drained, or -1 as soon as a dequeue or a
+// command fails (any commands still queued are left in place).
+int
+Command_Processor::execute (void)
+{
+  while (this->url_queue_.is_empty () != 1)
+    {
+      Command *command = 0;
+      if (this->url_queue_.dequeue_head (command) != 0)
+        ACE_ERROR_RETURN ((LM_ERROR,
+                           "%p\n", "dequeue_head"),
+                          -1);
+
+      URL_Command *url_command = dynamic_cast<URL_Command *> (command);
+      if (url_command == 0)
+        {
+          // The entry is null or is not a <URL_Command>.  It used to
+          // be dereferenced unconditionally; dispose of it instead.
+          // NOTE(review): assumes the processor owns queued commands,
+          // as it does for <URL_Command>s -- confirm.
+          if (command != 0)
+            command->destroy ();
+          ACE_ERROR_RETURN ((LM_ERROR,
+                             "unexpected command type in queue\n"),
+                            -1);
+        }
+
+      Auto_Destroyer<URL_Command> url_command_ptr (url_command);
+      if (url_command_ptr->execute () != 0)
+        ACE_ERROR_RETURN ((LM_ERROR,
+                           "%p\n", "Couldnt execute command"),
+                          -1);
+    }
+  return 0;
+}
+
+// Queue <command> at the tail ("FIFO") or the head ("LIFO") of the
+// queue, according to the order configured in <OPTIONS>.  Returns 0
+// if the command was queued and -1 otherwise; on failure the command
+// has not been queued and is not destroyed here.
+int
+Command_Processor::insert (Command *command)
+{
+  if (this->url_queue_.is_full () == 1)
+    ACE_ERROR_RETURN ((LM_ERROR,
+                       ACE_TEXT ("command queue is full\n")),
+                      -1);
+
+  const ACE_TCHAR *order = OPTIONS::instance ()->order ();
+
+  if (ACE_OS::strcmp (order, ACE_TEXT ("FIFO")) == 0)
+    {
+      if (this->url_queue_.enqueue_tail (command) != 0)
+        ACE_ERROR_RETURN ((LM_ERROR,
+                           ACE_TEXT ("%p\n"), ACE_TEXT ("enqueue_tail")),
+                          -1);
+    }
+  else if (ACE_OS::strcmp (order, ACE_TEXT ("LIFO")) == 0)
+    {
+      if (this->url_queue_.enqueue_head (command) != 0)
+        ACE_ERROR_RETURN ((LM_ERROR,
+                           ACE_TEXT ("%p\n"), ACE_TEXT ("enqueue_head")),
+                          -1);
+    }
+  else
+    // An unrecognized order used to drop the command silently while
+    // still reporting success.
+    ACE_ERROR_RETURN ((LM_ERROR,
+                       ACE_TEXT ("unknown traversal order %s\n"),
+                       order),
+                      -1);
+
+  return 0;
+}
+
+#if defined (ACE_HAS_EXPLICIT_STATIC_TEMPLATE_MEMBER_INSTANTIATION)
+template ACE_Singleton<Options, ACE_Null_Mutex> *ACE_Singleton<Options, ACE_Null_Mutex>::singleton_;
+#endif /* ACE_HAS_EXPLICIT_STATIC_TEMPLATE_MEMBER_INSTANTIATION */
diff --git a/ACE/examples/Web_Crawler/Command_Processor.h b/ACE/examples/Web_Crawler/Command_Processor.h
new file mode 100644
index 00000000000..742a316804c
--- /dev/null
+++ b/ACE/examples/Web_Crawler/Command_Processor.h
@@ -0,0 +1,98 @@
+/* -*- C++ -*- */
+// $Id$
+
+// ============================================================================
+//
+// = LIBRARY
+//    examples/Web_Crawler
+//
+// = FILENAME
+//    Command_Processor.h
+//
+// = AUTHOR
+//    Douglas C.
Schmidt <schmidt@cs.wustl.edu>
+//
+// ============================================================================
+
+#ifndef _COMMAND_PROCESSOR_H
+#define _COMMAND_PROCESSOR_H
+
+#if !defined (ACE_LACKS_PRAGMA_ONCE)
+#pragma once
+#endif /* ACE_LACKS_PRAGMA_ONCE */
+
+#include "ace/Containers.h"
+#include "Options.h"
+
+// Forward decl.
+class URL;
+
+class Command
+{
+  // = TITLE
+  //     Abstract base class for a command.
+  //
+  // = DESCRIPTION
+  //     Each command is executed by a <Command_Processor>.
+public:
+  virtual ~Command (void);
+  // Virtual destructor.
+
+  virtual int execute (void) = 0;
+  // This is the entry point to execute the command.
+
+  virtual int destroy (void) = 0;
+  // Dispose of the command.
+};
+
+class URL_Command : public Command
+{
+  // = TITLE
+  //     Defines an API for executing a command on a URL.
+  //
+  // = DESCRIPTION
+  //     Each command is executed by a <Command_Processor>.
+public:
+  URL_Command (URL *);
+  // Constructor.
+
+  virtual int execute (void);
+  // Execute the URL command.
+
+  int destroy (void);
+  // Delete this object.
+private:
+  URL *url_;
+  // Pointer to the URL.
+};
+
+class Command_Processor
+{
+  // = TITLE
+  //     Execute commands that are passed to it.
+  //
+  // = DESCRIPTION
+  //     This class implements the Command Processor pattern.
+public:
+  Command_Processor (void);
+
+  int insert (Command *);
+  // Insert a new <Command> into the <Command_Processor>'s queue.
+
+  int execute (void);
+  // Execute all the <Commands> in the queue.
+
+  int destroy (void);
+  // Destroy the <Command_Processor>.
+
+private:
+  friend class ACE_Shutup_GPlusPlus;
+  // Turn off g++ warning
+
+  ~Command_Processor (void);
+  // Ensure dynamic allocation.
+
+  // @@ You fill in here...
+  ACE_Unbounded_Queue<Command *> url_queue_;
+};
+
+
+#endif /* _COMMAND_PROCESSOR_H */
diff --git a/ACE/examples/Web_Crawler/HTTP_URL.cpp b/ACE/examples/Web_Crawler/HTTP_URL.cpp
new file mode 100644
index 00000000000..44ceea324d4
--- /dev/null
+++ b/ACE/examples/Web_Crawler/HTTP_URL.cpp
@@ -0,0 +1,87 @@
+// $Id$
+
+#include "ace/OS_NS_stdio.h"
+#include "ace/OS_NS_string.h"
+#include "ace/Auto_Ptr.h"
+#include "URL_Visitor.h"
+#include "Options.h"
+#include "HTTP_URL.h"
+
+ACE_RCSID(Web_Crawler, HTTP_URL, "$Id$")
+
+const ACE_URL_Addr &
+HTTP_URL::url_addr (void) const
+{
+  return this->url_addr_;
+}
+
+HTTP_URL::HTTP_URL (const ACE_URL_Addr &url_addr,
+                    HTTP_URL *cp)
+  : url_addr_ (url_addr),
+    containing_page_ (cp == 0 ? this : cp)
+{
+  ACE_DEBUG ((LM_DEBUG, "HTTP_URL %s\n", url_addr.addr_to_string ()));
+}
+
+// Send "GET /<path> HTTP/1.1" followed by a "Host:" header to the
+// server.  Returns the byte count of the second send on success and
+// -1 if either send fails or times out.
+ssize_t
+HTTP_URL::send_request (void)
+{
+  // One buffer is reused for both request lines, so it is sized to
+  // hold the path, the host name and the fixed protocol text.
+  size_t commandsize =
+    ACE_OS::strlen (this->url_addr ().get_path_name ())
+    + ACE_OS::strlen (this->url_addr ().get_host_name ())
+    + 20 // Extra
+    + 1 // NUL byte
+    + 16; // Protocol filler...
+
+  char *command;
+  ACE_NEW_RETURN (command,
+                  char[commandsize],
+                  -1);
+
+  // Ensure that the <command> memory is deallocated.
+  ACE_Auto_Basic_Array_Ptr<char> cmd_ptr (command);
+
+  ACE_OS::sprintf (cmd_ptr.get (),
+                   "GET /%s HTTP/1.1\r\n",
+                   ACE_TEXT_ALWAYS_CHAR (this->url_addr ().get_path_name ()));
+
+  // Send the GET command to the connected server.
+  if (this->stream ().send_n (cmd_ptr.get (),
+                              ACE_OS::strlen (cmd_ptr.get ()),
+                              const_cast<ACE_Time_Value *>
+                              (OPTIONS::instance ()->timeout ())) > 0)
+    {
+      ACE_OS::sprintf (cmd_ptr.get (),
+                       "Host: %s\r\n\r\n",
+                       this->url_addr ().get_host_name ());
+
+      // IMP: The length of the command has to be sent!
+      ssize_t retval =
+        this->stream ().send_n (cmd_ptr.get (),
+                                ACE_OS::strlen (cmd_ptr.get ()),
+                                const_cast<ACE_Time_Value *>
+                                (OPTIONS::instance ()->timeout ()));
+      this->stream ().svc_handler ()->idle (0);
+      if (retval <= 0)
+        return -1;
+      else
+        return retval;
+    }
+  else
+    return -1;
+}
+
+int
+HTTP_URL::accept (URL_Visitor *visitor)
+{
+  // This is part of the visitor pattern.
+  return visitor->visit (*this);
+}
+
+int
+HTTP_URL::destroy (void)
+{
+  // Delete this object; it must not be used afterwards.
+  delete this;
+  return 0;
+}
diff --git a/ACE/examples/Web_Crawler/HTTP_URL.h b/ACE/examples/Web_Crawler/HTTP_URL.h
new file mode 100644
index 00000000000..a926bb47938
--- /dev/null
+++ b/ACE/examples/Web_Crawler/HTTP_URL.h
@@ -0,0 +1,64 @@
+/* -*- C++ -*- */
+// $Id$
+
+// ============================================================================
+//
+// = LIBRARY
+//    examples/Web_Crawler
+//
+// = FILENAME
+//    HTTP_URL.h
+//
+// = AUTHOR
+//    Douglas C. Schmidt <schmidt@cs.wustl.edu>
+//
+// ============================================================================
+
+#ifndef _HTTP_URL_H
+#define _HTTP_URL_H
+
+#include "URL_Status.h"
+#include "URL.h"
+#include "Options.h"
+
+#if !defined (ACE_LACKS_PRAGMA_ONCE)
+#pragma once
+#endif /* ACE_LACKS_PRAGMA_ONCE */
+
+class HTTP_URL : public URL
+{
+  // = TITLE
+  //     An ADT for an HTTP URL.
+  //
+  // = DESCRIPTION
+  //     This class plays the "element" role in the Visitor pattern.
+public:
+  HTTP_URL (const ACE_URL_Addr &url_addr,
+            HTTP_URL *containing_page = 0);
+  // The <url_addr> is the URL that we're going to be visiting.  We
+  // also keep track of the containing page, if any, which is used to
+  // print out more meaningful messages.
+
+  virtual int accept (URL_Visitor *visitor);
+  // Accept the visitor, which will then perform a particular
+  // visitation strategy on the URL.  This method is part of the
+  // Visitor pattern.
+
+  virtual ssize_t send_request (void);
+  // Send a <GET> command to fetch the contents in the URI from the
+  // server.
+
+  virtual const ACE_URL_Addr &url_addr (void) const;
+  // Returns the URL that we represent.
+
+  int destroy (void);
+  // Delete this object.
+private:
+  ACE_URL_Addr url_addr_;
+  // Address of the URL we're connected to.
+
+  HTTP_URL *containing_page_;
+  // Page that contained us.
+};
+
+#endif /* _HTTP_URL_H */
diff --git a/ACE/examples/Web_Crawler/Iterators.cpp b/ACE/examples/Web_Crawler/Iterators.cpp
new file mode 100644
index 00000000000..98b4f999622
--- /dev/null
+++ b/ACE/examples/Web_Crawler/Iterators.cpp
@@ -0,0 +1,163 @@
+// $Id$
+
+#include "Options.h"
+#include "Iterators.h"
+
+ACE_RCSID(Web_Crawler, Iterators, "$Id$")
+
+URL_Iterator::~URL_Iterator (void)
+{
+}
+
+int
+URL_Iterator::destroy (void)
+{
+  // Delete this object; it must not be used afterwards.
+  delete this;
+  return 0;
+}
+
+HTML_Body_Iterator::HTML_Body_Iterator (URL &url)
+  : url_ (url)
+{
+}
+
+// Scan the body chunk by chunk for the next HREF/href attribute and
+// pass its quoted value back in <url>.  Returns the length of that
+// value, or 0 when no further link is found.
+int
+HTML_Body_Iterator::next (ACE_CString &url)
+{
+  size_t len = BUFSIZ;
+  const char *buf;
+  ACE_CString buffer;
+  int href_index = 0;
+
+  // <recv> returns a null pointer once the stream is exhausted.  (The
+  // original test was the relational comparison <buf > 0>.)
+  for (buf = this->url_.stream ().recv (len);
+       buf != 0;
+       buf = this->url_.stream ().recv (len))
+    {
+      // Copy only the <len> bytes that <recv> reported rather than a
+      // fixed <BUFSIZ>.
+      buffer.set (buf, len, 1);
+
+      href_index = buffer.find ("HREF");
+
+      if (href_index < 0)
+        href_index = buffer.find ("href");
+
+      // Grep for " and grab the string until end-"
+      if (href_index > 0)
+        {
+          // Get back to buffer start location.
+          this->url_.stream ().seek (-1 * static_cast<off_t> (len),
+                                     SEEK_CUR);
+
+          // NOTE(review): the index arithmetic below treats the
+          // results of <find> as relative to the search start --
+          // confirm against ACE_CString::find.
+          int start_index = buffer.find ('\"',
+                                         href_index);
+          if (start_index <= 0)
+            break;
+
+          start_index += href_index;
+
+          int end_index = buffer.find ('\"',
+                                       start_index + 1);
+          if (end_index <= 0)
+            break;
+
+          end_index += start_index + 1;
+
+          ssize_t url_len = end_index - (start_index + 1);
+
+          ACE_CString temp = buffer.substring (start_index + 1,
+                                               url_len);
+          // Copy exactly the extracted substring; the chunk length
+          // <len> used to be passed here, which is larger than <temp>.
+          url.set (temp.c_str (), temp.length (), 1);
+
+          this->url_.stream ().seek (end_index + 1);
+
+          return url_len;
+        }
+    }
+  return 0;
+
+}
+
+HTTP_Header_Iterator::HTTP_Header_Iterator (URL &url)
+  : url_ (url),
+    end_of_header_ (0)
+{
+}
+
+// Pass back the next header line in <line>.  Returns 1 if a line was
+// produced and 0 once the end of the header (or of the stream) has
+// been reached.
+int
+HTTP_Header_Iterator::next (ACE_CString &line)
+{
+  if (this->end_of_header_)
+    return 0;
+
+  // <get_char> returns an int; keeping it in an int (rather than a
+  // char compared with <(char) EOF>) stops a data byte from being
+  // narrowed onto the EOF value here.
+  for (int c;
+       (c = this->url_.stream ().get_char ()) != EOF;
+       )
+    {
+      // Check to see if we're at the end of the header line.
+      if (c == '\r' && this->url_.stream ().peek_char (0) == '\n')
+        {
+          line.set (this->url_.stream ().recv (),
+                    this->url_.stream ().recv_len () - 1,
+                    1);
+
+          // Check to see if we're at the end of the header.
+          if (this->url_.stream ().peek_char (1) == '\r'
+              && this->url_.stream ().peek_char (2) == '\n')
+            {
+              this->end_of_header_ = 1;
+              // We're at the end of the header section.
+              this->url_.stream ().seek (3);
+            }
+          else
+            // We're at the end of the line.
+            this->url_.stream ().seek (1);
+
+          return 1;
+        }
+      // Handle broken Web servers that use '\n' instead of
+      // '\r\n'.
+      else if (c == '\n')
+        {
+          line.set (this->url_.stream ().recv (),
+                    (this->url_.stream ().recv_len ()),
+                    1);
+
+          // Check to see if we're at the end of the header.
+          if (this->url_.stream ().peek_char (0) == '\n')
+            {
+              // We're at the end of the header section.
+              this->url_.stream ().seek (1);
+              this->end_of_header_ = 1;
+            }
+
+          return 1;
+        }
+    }
+
+  return 0;
+}
+
+URL_Download_Iterator::URL_Download_Iterator (URL &url)
+  : url_ (url)
+{
+}
+
+// Pass back the next chunk of at most <BUFSIZ> bytes in <buffer>.
+// Returns 1 if a chunk was produced and 0 when <recv> has no more
+// data.
+int
+URL_Download_Iterator::next (ACE_CString &buffer)
+{
+  size_t len = BUFSIZ;
+
+  const char *buf = this->url_.stream ().recv (len);
+
+  if (buf == 0)
+    return 0;
+
+  buffer.set (buf, len, 1);
+  return 1;
+}
diff --git a/ACE/examples/Web_Crawler/Iterators.h b/ACE/examples/Web_Crawler/Iterators.h
new file mode 100644
index 00000000000..b5d267f7afb
--- /dev/null
+++ b/ACE/examples/Web_Crawler/Iterators.h
@@ -0,0 +1,117 @@
+/* -*- C++ -*- */
+// $Id$
+
+// ============================================================================
+//
+// = LIBRARY
+//    examples/Web_Crawler
+//
+// = FILENAME
+//    Iterators.h
+//
+// = AUTHOR
+//    Douglas C. Schmidt <schmidt@cs.wustl.edu>
+//
+// ============================================================================
+
+#ifndef _ITERATORS_H
+#define _ITERATORS_H
+
+#include "URL.h"
+
+#if !defined (ACE_LACKS_PRAGMA_ONCE)
+#pragma once
+#endif /* ACE_LACKS_PRAGMA_ONCE */
+
+class URL_Iterator
+{
+  // = TITLE
+  //     An abstract base class that defines an iterator.
+  //
+  // = DESCRIPTION
+  //     Subclasses of this base class can define what strings
+  //     to return from <next>.  This class decouples higher-level
+  //     software from the details of whatever type of URL header or
+  //     body we're iterating over.
+public:
+  // = Initialization and termination methods.
+  virtual int destroy (void);
+  // "virtual" destructor.
+
+  // = Iterator methods.
+  virtual int next (ACE_CString &string) = 0;
+  // Pass back the next <string> that hasn't been seen yet.  Returns 0
+  // when all items have been seen, else non-0.
+
+protected:
+  virtual ~URL_Iterator (void);
+  // C++ destructor.
+};
+
+class HTML_Body_Iterator : public URL_Iterator
+{
+  // = TITLE
+  //     An iterator that returns URLs embedded in HTML files.
+public:
+  // = Initialization and termination methods.
+  HTML_Body_Iterator (URL &url);
+  // Constructor.
+
+  // = Iterator methods.
+  virtual int next (ACE_CString &url);
+  // Pass back the next <url> that hasn't been seen in the
+  // memory-mapped file.  Returns 0 when all items have been seen,
+  // else the length of the <url> that was found.
+
+private:
+  URL &url_;
+  // HTTP URL that we're iterating over.
+};
+
+class HTTP_Header_Iterator : public URL_Iterator
+{
+  // = TITLE
+  //     An iterator that iterates over the HTTP header.
+public:
+  // = Initialization and termination methods.
+  HTTP_Header_Iterator (URL &url);
+  // Constructor.
+
+  // = Iterator methods.
+  virtual int next (ACE_CString &line);
+  // Pass back the next <line> that hasn't been seen in the
+  // memory-mapped file header.  Returns 0 when we've reached the end
+  // of the header, else 1.
+
+private:
+  URL &url_;
+  // HTTP URL that we're iterating over.
+
+  int end_of_header_;
+  // We've found the end of the header, which means this iterator is
+  // finished.
+};
+
+class URL_Download_Iterator : public URL_Iterator
+{
+  // = TITLE
+  //     An iterator that iterates over the contents of an entire URL,
+  //     i.e., both header and body, and returns it in <BUFSIZ>
+  //     <buffer>s.
+public:
+  // = Initialization and termination methods.
+  URL_Download_Iterator (URL &url);
+  // Constructor.
+
+  // = Iterator methods.
+  virtual int next (ACE_CString &buffer);
+  // Pass back the next <buffer> data from the stream, where
+  // <buffer.size> <= <BUFSIZ> .  Returns 0 when we've reached the end
+  // of the stream, else 1.
+
+private:
+  URL &url_;
+  // HTTP URL that we're iterating over.
+};
+
+#endif /* _ITERATORS_H */
diff --git a/ACE/examples/Web_Crawler/Makefile.am b/ACE/examples/Web_Crawler/Makefile.am
new file mode 100644
index 00000000000..00a30a4d2e1
--- /dev/null
+++ b/ACE/examples/Web_Crawler/Makefile.am
@@ -0,0 +1,60 @@
+## Process this file with automake to create Makefile.in
+##
+## $Id$
+##
+## This file was generated by MPC.
Any changes made directly to +## this file will be lost the next time it is generated. +## +## MPC Command: +## /acebuilds/ACE_wrappers-repository/bin/mwc.pl -include /acebuilds/MPC/config -include /acebuilds/MPC/templates -feature_file /acebuilds/ACE_wrappers-repository/local.features -noreldefs -type automake -exclude build,Kokyu + +ACE_BUILDDIR = $(top_builddir) +ACE_ROOT = $(top_srcdir) + + +## Makefile.Web_Crawler.am + +if !BUILD_ACE_FOR_TAO +noinst_PROGRAMS = main + +main_CPPFLAGS = \ + -I$(ACE_ROOT) \ + -I$(ACE_BUILDDIR) + +main_SOURCES = \ + Command_Processor.cpp \ + HTTP_URL.cpp \ + Iterators.cpp \ + Mem_Map_Stream.cpp \ + Options.cpp \ + URL.cpp \ + URL_Addr.cpp \ + URL_Status.cpp \ + URL_Visitor.cpp \ + URL_Visitor_Factory.cpp \ + Web_Crawler.cpp \ + main.cpp \ + Command_Processor.h \ + HTTP_URL.h \ + Iterators.h \ + Mem_Map_Stream.h \ + Options.h \ + URL.h \ + URL_Addr.h \ + URL_Status.h \ + URL_Visitor.h \ + URL_Visitor_Factory.h \ + Web_Crawler.h + +main_LDADD = \ + $(ACE_BUILDDIR)/ace/libACE.la + +endif !BUILD_ACE_FOR_TAO + +## Clean up template repositories, etc. 
+clean-local:
+	-rm -f *~ *.bak *.rpo *.sym lib*.*_pure_* core core.*
+	-rm -f gcctemp.c gcctemp so_locations *.ics
+	-rm -rf cxx_repository ptrepository ti_files
+	-rm -rf templateregistry ir.out
+	-rm -rf ptrepository SunWS_cache Templates.DB
diff --git a/ACE/examples/Web_Crawler/Mem_Map_Stream.cpp b/ACE/examples/Web_Crawler/Mem_Map_Stream.cpp
new file mode 100644
index 00000000000..dda1d465a71
--- /dev/null
+++ b/ACE/examples/Web_Crawler/Mem_Map_Stream.cpp
@@ -0,0 +1,240 @@
+// $Id$
+
+#include "ace/FILE_Addr.h"
+#include "ace/Auto_Ptr.h"
+#include "Options.h"
+#include "Mem_Map_Stream.h"
+
+ACE_RCSID(Web_Crawler, Mem_Map_Stream, "$Id$")
+
+ACE_SOCK_Stream &
+Mem_Map_Stream::stream (void)
+{
+  return svc_handler_->peer ();
+}
+
+ssize_t
+Mem_Map_Stream::send_n (const void *buf, size_t size, ACE_Time_Value *tv)
+{
+  return svc_handler_->peer ().send_n (buf, size, 0, tv);
+}
+
+int
+Mem_Map_Stream::eof (void) const
+{
+  return this->get_pos_ >= this->end_of_mapping_plus1_;
+}
+
+int
+Mem_Map_Stream::get_char (void)
+{
+  if (this->eof () && this->grow_file_and_remap () == -1)
+    return EOF;
+
+  // Widen through unsigned char, as the C stdio <getc> family does,
+  // so that a data byte with the high bit set is never returned as a
+  // negative value that could equal EOF.
+  return static_cast<unsigned char> (*this->get_pos_++);
+}
+
+int
+Mem_Map_Stream::rewind (void)
+{
+  this->recv_pos_ =
+    reinterpret_cast<char *> (this->mem_map_.addr ());
+  this->get_pos_ = this->recv_pos_;
+  this->end_of_mapping_plus1_ =
+    this->recv_pos_ + this->mem_map_.size ();
+  return 0;
+}
+
+int
+Mem_Map_Stream::peek_char (size_t offset)
+{
+  // We may need to iterate if the size of <offset> is large.
+  while (this->get_pos_ + offset >= this->end_of_mapping_plus1_)
+    if (this->grow_file_and_remap () == -1)
+      return EOF;
+
+  // Same widening as <get_char>, so a data byte cannot equal EOF.
+  return static_cast<unsigned char> (this->get_pos_[offset]);
+}
+
+const char *
+Mem_Map_Stream::recv (void) const
+{
+  return this->recv_pos_;
+}
+
+const char *
+Mem_Map_Stream::recv (size_t &len)
+{
+  if (this->eof () && this->grow_file_and_remap () == -1)
+    return 0;
+
+
+  // Remember where the chunk starts, advance both positions by <len>
+  // and report how far the <get> position actually moved.
+  const char *s = this->recv_pos_;
+  off_t olen = static_cast <off_t> (len);
+  this->seek (olen, SEEK_CUR);
+  len = this->get_pos_ - s;
+  return s;
+}
+
+size_t
+Mem_Map_Stream::recv_len (void) const
+{
+  return this->get_pos_ - this->recv_pos_;
+}
+
+const char *
+Mem_Map_Stream::peek_str (size_t offset,
+                          size_t size)
+{
+  // We will iterate if the size of <offset> is large.
+  while (this->get_pos_ + (offset + size) > this->end_of_mapping_plus1_)
+    if (this->grow_file_and_remap () == -1)
+      return 0;
+
+  return &this->get_pos_[offset];
+}
+
+off_t
+Mem_Map_Stream::seek (off_t offset, int whence)
+{
+  switch (whence)
+    {
+    case SEEK_SET:
+      this->get_pos_ =
+        reinterpret_cast<char *> (this->mem_map_.addr ())
+        + offset;
+      break;
+
+    case SEEK_CUR:
+      this->get_pos_ += offset;
+      break;
+
+    case SEEK_END:
+      // @@ Not sure how to implement this (yet).  Report "not
+      // supported" without touching the <get> position; it used to be
+      // moved before the error was returned.
+      ACE_NOTSUP_RETURN (-1);
+    }
+
+  // Make sure that the backing store will cover this.
+  while (this->get_pos_ > this->end_of_mapping_plus1_)
+    if (this->grow_file_and_remap () == -1)
+      return (off_t) -1;
+
+  this->recv_pos_ = this->get_pos_;
+  return this->recv_pos_ - reinterpret_cast<char *> (this->mem_map_.addr ());
+}
+
+Mem_Map_Stream::Svc_Handler *
+Mem_Map_Stream::svc_handler (void)
+{
+  return this->svc_handler_;
+}
+
+
+int
+Mem_Map_Stream::open (STRAT_CONNECTOR *connector,
+                      const ACE_INET_Addr &addr)
+{
+  svc_handler_ = 0;
+
+  // Connect to the server at <addr>.  If the handler has to be
+  // connected to the server again, the Caching strategy takes care
+  // and uses the same connection.
+  if (connector->connect (svc_handler_,
+                          addr) == -1)
+    {
+
+      ACE_ERROR_RETURN ((LM_ERROR,
+                         "%p %s %d\n",
+                         "Connect failed",
+                         addr.get_host_name (),
+                         addr.get_port_number ()),
+                        -1);
+    }
+  // Create a temporary filename.
+  ACE_FILE_Addr file (ACE_sap_any_cast (ACE_FILE_Addr &));
+
+  // Create the temporary file via the <ACE_Mem_Map> class API.
+  if (this->mem_map_.open (file.get_path_name (),
+                           O_RDWR | O_CREAT | O_APPEND,
+                           ACE_DEFAULT_FILE_PERMS) == -1)
+    ACE_ERROR_RETURN ((LM_ERROR,
+                       "%p\n",
+                       "open"),
+                      -1);
+  // Make sure to unlink this right away so that if this process
+  // crashes these files will be removed automatically.
+#if 0
+  else if (ACE_OS::unlink (file.get_path_name ()) == -1)
+    ACE_ERROR_RETURN ((LM_ERROR,
+                       "%p\n",
+                       "unlink"),
+                      -1);
+  else
+#endif
+    // Reset all the position pointers to the start of the mapping.
+    this->rewind ();
+
+  return 0;
+}
+
+int
+Mem_Map_Stream::grow_file_and_remap (void)
+{
+  char buf[BUFSIZ + 1];
+
+  // Copy the next chunk of bytes from the socket into the temporary
+  // file.
+  ACE_Time_Value tv (*OPTIONS::instance ()->timeout ());
+
+  ssize_t n = this->svc_handler_->peer ().recv_n (buf,
+                                                  sizeof buf,
+                                                  0,
+                                                  &tv);
+  if (n == -1)
+    {
+      if (OPTIONS::instance ()->debug ())
+        ACE_ERROR ((LM_ERROR,
+                    "%p\n",
+                    "recv"));
+      return -1;
+    }
+  else if (n == 0)
+    return -1;
+  else if (ACE::write_n (this->mem_map_.handle (), buf, n) != n)
+    ACE_ERROR_RETURN ((LM_ERROR,
+                       "%p\n",
+                       "write_n"),
+                      -1);
+
+  // Grow the memory-mapping to encompass the entire temporary file.
+  if (this->mem_map_.map (-1,
+                          PROT_RDWR,
+                          ACE_MAP_PRIVATE | ACE_MAP_FIXED,
+                          ACE_DEFAULT_BASE_ADDR) == -1)
+    ACE_ERROR_RETURN ((LM_ERROR,
+                       "%p\n",
+                       "map"),
+                      -1);
+  // MAP_FAILED is used as a "first time in" flag.
+  if (this->recv_pos_ == MAP_FAILED)
+    {
+      this->recv_pos_ = reinterpret_cast<char *> (this->mem_map_.addr ());
+      this->get_pos_ = this->recv_pos_;
+    }
+
+  this->end_of_mapping_plus1_ =
+    reinterpret_cast<char *> (this->mem_map_.addr ())
+    + this->mem_map_.size ();
+
+  return 0;
+}
+
+Mem_Map_Stream::~Mem_Map_Stream (void)
+{
+  // Remove the mapping and the file.
+  this->mem_map_.remove ();
+}
+
diff --git a/ACE/examples/Web_Crawler/Mem_Map_Stream.h b/ACE/examples/Web_Crawler/Mem_Map_Stream.h
new file mode 100644
index 00000000000..3595f04ab77
--- /dev/null
+++ b/ACE/examples/Web_Crawler/Mem_Map_Stream.h
@@ -0,0 +1,190 @@
+/* -*- C++ -*- */
+// $Id$
+
+// ============================================================================
+//
+// = LIBRARY
+//    examples/Web_Crawler
+//
+// = FILENAME
+//    Mem_Map_Stream.h
+//
+// = AUTHOR
+//    Douglas C. Schmidt <schmidt@cs.wustl.edu>
+//
+// ============================================================================
+
+#ifndef _MEM_MAP_STREAM_H
+#define _MEM_MAP_STREAM_H
+#include /**/ "ace/pre.h"
+
+#include "ace/SOCK_Stream.h"
+
+#if !defined (ACE_LACKS_PRAGMA_ONCE)
+#pragma once
+#endif /* ACE_LACKS_PRAGMA_ONCE */
+
+
+#include "ace/Mem_Map.h"
+#include "ace/SOCK_Connector.h"
+#include "ace/Connector.h"
+#include "ace/Svc_Handler.h"
+#include "ace/Strategies_T.h"
+
+class Mem_Map_Stream
+{
+  // = TITLE
+  //     Provides a memory-mapped stream abstraction to simplify parsing
+  //     of tokens.
+  //
+  // = DESCRIPTION
+  //     This class makes it possible to treat a connection as a stream
+  //     of bytes, similar to the C library stdio streams.  The contents
+  //     of the connection are buffered incrementally in a memory-mapped
+  //     file.  This class maintains pointers to two positions in the
+  //     stream:
+  //
+  //      1. The <recv> position, which keeps track of the beginning of a
+  //         token that is in the stream.
+  //
+  //      2. The <get> position, which moves along character-by-character
+  //         until the end of the token is reached.
+  //
+  //     Once a token has been located, it can be extracted from the
+  //     stream by calling the <recv>.  The length of the token, i.e.,
+  //     the <recv_len>, is the length in bytes between the <get>
+  //     position and the <recv> position.  Once the token has been
+  //     extracted, the <recv> and <get> positions can be updated by the
+  //     <seek> method.
+
+public:
+  typedef ACE_Svc_Handler<ACE_SOCK_STREAM, ACE_NULL_SYNCH> Svc_Handler;
+
+  typedef ACE_Strategy_Connector<Svc_Handler,
+                                 ACE_SOCK_CONNECTOR>
+          STRAT_CONNECTOR;
+
+  // Mem_Map_Stream (void);
+  // constructor added:KIRTHIKA
+  virtual int open (STRAT_CONNECTOR *connector,
+                    const ACE_INET_Addr &);
+  // Initialize this object.
+
+  virtual ~Mem_Map_Stream (void);
+  // Destructor.
+
+  // = Accessor.
+  ACE_SOCK_Stream &stream (void);
+  // Returns the underlying <ACE_SOCK_Stream>.
+
+  // = I/O methods.
+
+  virtual ssize_t send_n (const void *buf,
+                          size_t size,
+                          ACE_Time_Value *tv = 0);
+  // Send <size> bytes in <buf> to the connected peer.  This is a
+  // completely unbuffered call.
+
+  virtual int get_char (void);
+  // Return the next character in the stream and advance the <get>
+  // position.  Returns EOF when the <get> position reaches the end of
+  // the HTTP stream.
+
+  virtual const char *recv (size_t &len);
+  // Returns a pointer to array of at most <len> characters starting
+  // at the <recv> position.  If the <recv> position + <len> extends
+  // past the EOF then <len> is set to the number of characters
+  // between the <recv> position and the EOF and both the <get> and
+  // <recv> positions are advanced by <len>.  Returns 0 if the <recv>
+  // position is at the EOF.
+
+  virtual const char *recv (void) const;
+  // Returns a pointer to array of characters starting at the <recv>
+  // position.
+
+  virtual size_t recv_len (void) const;
+  // Returns the length in bytes between the <get> position and the
+  // <recv> position.
+
+  virtual int rewind (void);
+  // Resets the <get> and <recv> positions to the beginning of the
+  // stream.  This works since all the data has been cached in the
+  // memory-mapped backing store.
+
+  virtual int peek_char (size_t offset);
+  // Returns the nth character <offset> from the <get> position in the
+  // stream without advancing the <get> position.  Automatically
+  // extends the backing store if necessary.  Returns EOF if <offset>
+  // is past the end of the stream.
+
+  virtual const char *peek_str (size_t offset, size_t size);
+  // Return a pointer to an array of <size> characters starting at
+  // <offset> characters from the <get> position in the stream without
+  // advancing the <get> position.  Automatically extends the backing
+  // store if necessary.  Returns 0 if <offset> or <offset + size> is
+  // past the end of the stream.
+
+  virtual off_t seek (off_t offset, int whence = SEEK_CUR);
+  // Sets the <get> and <recv> positions as follows:
+  //    o  If <whence> is <SEEK_SET>, the positions are set to <offset>
+  //       bytes from the start of the stream.
+  //
+  //    o  If <whence> is <SEEK_CUR>, the positions are set to the
+  //       current <get> position plus <offset>.
+  //
+  //    o  If <whence> is <SEEK_END>, the positions are set to the size
+  //       of the stream plus <offset>.
+
+  virtual int eof (void) const;
+  // Returns 1 if we're at the end of the HTTP stream, else 0.
+
+
+  /*
+  typedef ACE_NOOP_Creation_Strategy<Svc_Handler>
+          NULL_CREATION_STRATEGY;
+  typedef ACE_NOOP_Concurrency_Strategy<Svc_Handler>
+          NULL_ACTIVATION_STRATEGY;
+  typedef ACE_Cached_Connect_Strategy<Svc_Handler,
+                                      ACE_SOCK_CONNECTOR,
+                                      ACE_SYNCH_NULL_MUTEX>
+          CACHED_CONNECT_STRATEGY;*/
+
+  Svc_Handler *svc_handler (void);
+
+private:
+  int grow_file_and_remap (void);
+  // Grow the file by reading another chunk from the HTTP socket and
+  // extend the mapping to cover this chunk.  Returns -1 on failure or
+  // EOF, else 0.
+
+  //ACE_SOCK_Stream stream_;
+
+  Svc_Handler *svc_handler_;
+  // Connection to peer. The granularity is at the Svc_Handler level.
+  // The Svc_Handler has an SOCK_Stream.
+  /*
+  NULL_CREATION_STRATEGY creation_strategy_;
+  NULL_ACTIVATION_STRATEGY activation_strategy_;
+  // Configure the Strategy Connector with a strategy that caches
+  // connection.
+  CACHED_CONNECT_STRATEGY caching_connect_strategy_;
+
+  STRAT_CONNECTOR *strat_connector_; */
+
+  ACE_Mem_Map mem_map_;
+  // Memory-mapped file that we're iterating over.
+
+  char *recv_pos_;
+  // Pointer to the address where the next <recv> method will start.
+
+  char *get_pos_;
+  // Pointer to the address where the next <get_char> method will
+  // start.
+
+  char *end_of_mapping_plus1_;
+  // Address at the end of the file mapping.
+
+};
+
+#include /**/ "ace/post.h"
+#endif /* _MEM_MAP_STREAM_H */
diff --git a/ACE/examples/Web_Crawler/Options.cpp b/ACE/examples/Web_Crawler/Options.cpp
new file mode 100644
index 00000000000..389cbfa0733
--- /dev/null
+++ b/ACE/examples/Web_Crawler/Options.cpp
@@ -0,0 +1,177 @@
+// $Id$
+
+#include "ace/Get_Opt.h"
+#include "ace/Log_Msg.h"
+#include "URL_Addr.h"
+#include "Options.h"
+#include "ace/OS_NS_string.h"
+
+ACE_RCSID(Web_Crawler, Options, "$Id$")
+
+// Reset the options to their defaults and then apply the command-line
+// switches.  Always returns 0; an unknown switch prints the usage
+// message.
+int
+Options::parse_args (int argc, ACE_TCHAR *argv[])
+{
+  ACE_Get_Opt getopt (argc, argv, ACE_TEXT ("df:h:i:l:rt:u:vo:p:"));
+
+  ACE_LOG_MSG->open (argv[0]);
+
+  this->hostname_ = ACE_TEXT ("www.cs.wustl.edu");
+  this->uri_ = ACE_TEXT ("index.html");
+  this->recurse_ = 0;
+  this->debug_ = 0;
+  this->timeout_.sec (ACE_DEFAULT_TIMEOUT);
+  this->url_filter_ = 0;
+  this->verbose_ = 0;
+  this->order_ = ACE_TEXT ("FIFO");
+  this->port_no_ = ACE_DEFAULT_HTTP_PORT;
+
+  // The default is to make this limit as large as possible.
+  this->handle_limit_ = -1;
+
+  for (int c;
+       (c = getopt ()) != EOF;
+       )
+    switch (c)
+      {
+      case ACE_TEXT ('d'):
+        this->debug_ = 1;
+        break;
+      case ACE_TEXT ('f'):
+        this->url_filter_ = getopt.opt_arg ();
+        break;
+      case ACE_TEXT ('h'):
+        this->hostname_ = getopt.opt_arg ();
+        break;
+      case ACE_TEXT ('i'):
+        this->uri_ = getopt.opt_arg ();
+        break;
+      case ACE_TEXT ('l'):
+        this->handle_limit_ = ACE_OS::atoi (getopt.opt_arg ());
+        break;
+      case ACE_TEXT ('r'):
+        this->recurse_ = 1;
+        break;
+      case ACE_TEXT ('t'):
+        this->timeout_.sec (ACE_OS::atoi (getopt.opt_arg ()));
+        break;
+      case ACE_TEXT ('u'):
+        {
+          // Split "host/path" in place at the first '/': the part
+          // before it is the hostname, the part after it the URI.
+          this->hostname_ = getopt.opt_arg ();
+          ACE_TCHAR *s = ACE_OS::strchr (getopt.opt_arg (), ACE_TEXT ('/'));
+          if (s != 0)
+            {
+              this->uri_ = s + 1;
+              *s = ACE_TEXT ('\0');
+            }
+          else
+            ACE_ERROR ((LM_ERROR,
+                        ACE_TEXT ("invalid URL %s\n"),
+                        getopt.opt_arg ()));
+        }
+        break;
+      case ACE_TEXT ('v'):
+        this->verbose_ = 1;
+        break;
+      case ACE_TEXT ('o'):
+        {
+          this->order_ = getopt.opt_arg ();
+        }
+        break;
+      case ACE_TEXT ('p'):
+        this->port_no_ = ACE_OS::atoi (getopt.opt_arg ());
+        break;
+      default:
+        // The usage text now lists -i, -o and -p, which the option
+        // string above accepts but the message used to omit.
+        ACE_ERROR ((LM_ERROR,
+                    ACE_TEXT ("usage: %n [-d] [-f filter] [-h hostname] [-i path]")
+                    ACE_TEXT (" [-l handle-limit] [-o FIFO|LIFO] [-p port] [-r]")
+                    ACE_TEXT (" [-t timeout] [-u URI] [-v]\n%a"),
+                    1));
+
+        /* NOTREACHED */
+      }
+
+  return 0;
+}
+
+int
+Options::port_no (void) const
+{
+  return this->port_no_;
+}
+
+int
+Options::recurse (void) const
+{
+  return this->recurse_;
+}
+
+const ACE_Time_Value *
+Options::timeout (void) const
+{
+  return &this->timeout_;
+}
+
+int
+Options::debug (void) const
+{
+  return this->debug_;
+}
+
+int
+Options::verbose (void) const
+{
+  return this->verbose_;
+}
+
+const ACE_TCHAR *
+Options::order (void) const
+{
+  return this->order_;
+}
+
+const ACE_TCHAR *
+Options::hostname (void) const
+{
+  return this->hostname_;
+}
+
+const ACE_TCHAR *
+Options::path_name (void) const
+{
+  return this->uri_;
+}
+
+const ACE_TCHAR *
+Options::url_filter (void) const
+{
+  return this->url_filter_;
+}
+
+Command_Processor *
+Options::command_processor (void) const
+{
+  return this->command_processor_;
+}
+
+void
+Options::command_processor (Command_Processor *cp)
+{
+  this->command_processor_ = cp;
+}
+
+URL_Visitor *
+Options::visitor (void) const
+{
+  return this->visitor_;
+}
+
+void
+Options::visitor (URL_Visitor *v)
+{
+  this->visitor_ = v;
+}
+
+int
+Options::handle_limit (void)
+{
+  return this->handle_limit_;
+}
diff --git a/ACE/examples/Web_Crawler/Options.h b/ACE/examples/Web_Crawler/Options.h
new file mode 100644
index 00000000000..ef5f2efd40c
--- /dev/null
+++ b/ACE/examples/Web_Crawler/Options.h
@@ -0,0 +1,124 @@
+/* -*- C++ -*- */
+// $Id$
+
+// ============================================================================
+//
+// = LIBRARY
+//    examples/Web_Crawler
+//
+// = FILENAME
+//    Options.h
+//
+// = AUTHOR
+//    Douglas C. Schmidt <schmidt@cs.wustl.edu>
+//
+// ============================================================================
+
+#ifndef _OPTIONS_H
+#define _OPTIONS_H
+
+#include "ace/Null_Mutex.h"
+#include "ace/Singleton.h"
+#include "ace/Time_Value.h"
+
+#if !defined (ACE_LACKS_PRAGMA_ONCE)
+#pragma once
+#endif /* ACE_LACKS_PRAGMA_ONCE */
+
+// Forward decls.
+class Command_Processor;
+class URL_Visitor;
+
+class Options
+{
+  // = TITLE
+  //     Maintains the global options.
+  //
+  // = DESCRIPTION
+  //     This class is converted into a Singleton by the
+  //     <ACE_Singleton> template.
+public:
+  int parse_args (int argc, ACE_TCHAR *argv[]);
+  // Parse the command-line arguments and initialize the options.
+
+  int recurse (void) const;
+  // If non-0 and the link is an HTML file then recursively check all
+  // links that are embedded in the body of file.
+
+  const ACE_TCHAR *hostname (void) const;
+  // Return the hostname of the initial Web server.
+
+  const ACE_TCHAR *path_name (void) const;
+  // Return the initial URI.
+
+  const ACE_TCHAR *url_filter (void) const;
+  // String used to filter out which URLs to validate.
+
+  int debug (void) const;
+  // Are we debugging?
+
+  int verbose (void) const;
+  // Are we being verbose?
+
+  const ACE_TCHAR *order (void) const;
+  // Which order? LIFO|FIFO??
+
+  int port_no (void) const;
+  // Port #
+
+  const ACE_Time_Value *timeout (void) const;
+  // Return the timeout used to prevent hanging on <recv> and
+  // <connect> calls to broken servers.
+
+  // = Get/set the <Command_Processor>.
+  Command_Processor *command_processor (void) const;
+  void command_processor (Command_Processor *);
+
+  // = Get/set the <URL_Visitor>.
+  URL_Visitor *visitor (void) const;
+  void visitor (URL_Visitor *);
+
+  // Get the handle_limit.
+  int handle_limit (void);
+private:
+  int recurse_;
+  // Are we recursing.
+
+  const ACE_TCHAR *hostname_;
+  // Initial Web server name.
+
+  const ACE_TCHAR *uri_;
+  // Initial URI name.
+
+  int debug_;
+  // Are we debugging?
+
+  int verbose_;
+  // Are we being verbose?
+
+  const ACE_TCHAR *order_;
+  // Whether the URLs are traversed in FIFO or LIFO order.
+
+  ACE_Time_Value timeout_;
+  // Timeout on <recv> and <connect> to broken Web servers.
+
+  const ACE_TCHAR *url_filter_;
+  // String used to filter out which URLs to validate.
+
+  Command_Processor *command_processor_;
+  // Pointer to the Command_Processor.
+
+  URL_Visitor *visitor_;
+  // Pointer to the <URL_Visitor>.
+
+  int port_no_;
+  // Port no.
+
+  int handle_limit_;
+  // The limit of the number of descriptors to be given for this process.
+};
+
+// Typedef an Options Singleton.
+typedef ACE_Singleton <Options, ACE_Null_Mutex> OPTIONS;
+
+#endif /* _OPTIONS_H */
diff --git a/ACE/examples/Web_Crawler/README b/ACE/examples/Web_Crawler/README
new file mode 100644
index 00000000000..4f81809173d
--- /dev/null
+++ b/ACE/examples/Web_Crawler/README
@@ -0,0 +1,25 @@
+Web Crawler Kirthika Parameswaran
+-----------
+
+The Web Crawler follows the HTTP_1.1 protocol.
+ +This Crawler crawls in either FIFO or LIFO order over the URLs +now stored in a ACE_Unbounded_Queue. The Command Processor pattern is +used in this example. + +Also the auto-purging feature where connections are removed from the cache +when the process runs out of file descriptors, is added to this example. + +[Use the -l option to set the handle limit]. + +Run: +--- + + +> make + +> main -r -u www.cs.wustl.edu/~kirthika/test.html -o LIFO + +or + +> main -r -u www.cs.wustl.edu/~kirthika/test.html -o FIFO diff --git a/ACE/examples/Web_Crawler/URL.cpp b/ACE/examples/Web_Crawler/URL.cpp new file mode 100644 index 00000000000..ce52ed892ad --- /dev/null +++ b/ACE/examples/Web_Crawler/URL.cpp @@ -0,0 +1,39 @@ +// $Id$ + +#include "URL.h" + +ACE_RCSID(Web_Crawler, URL, "$Id$") + +Mem_Map_Stream & +URL::stream (void) +{ + return this->stream_; +} + +URL::~URL (void) +{ +} + +const URL_Status & +URL::reply_status (void) +{ + return this->reply_status_; +} + +void +URL::reply_status (const URL_Status &rs) +{ + this->reply_status_ = rs; +} + +const ACE_CString & +URL::content_type (void) +{ + return this->content_type_; +} + +void +URL::content_type (const ACE_CString &ct) +{ + this->content_type_ = ct; +} diff --git a/ACE/examples/Web_Crawler/URL.h b/ACE/examples/Web_Crawler/URL.h new file mode 100644 index 00000000000..68c41f018ad --- /dev/null +++ b/ACE/examples/Web_Crawler/URL.h @@ -0,0 +1,82 @@ +/* -*- C++ -*- */ +// $Id$ + +// ============================================================================ +// +// = LIBRARY +// examples/Web_Crawler +// +// = FILENAME +// URL.h +// +// = AUTHOR +// Douglas C. 
Schmidt <schmidt@cs.wustl.edu> +// +// ============================================================================ + +#ifndef _URL_H +#define _URL_H + +#include "Mem_Map_Stream.h" + +#if !defined (ACE_LACKS_PRAGMA_ONCE) +#pragma once +#endif /* ACE_LACKS_PRAGMA_ONCE */ + +#include "URL_Addr.h" +#include "URL_Status.h" + +#include "ace/SString.h" + + +// Forward declaration. +class URL_Visitor; + +class URL +{ + // = TITLE + // Base class for a URL. + // + // = DESCRIPTION + // This class plays a role in the Visitor pattern. +public: + virtual ~URL (void); + // Destructor. + + virtual int accept (URL_Visitor *visitor) = 0; + // Accept the visitor, which will then perform a particular + // visitation strategy on the URL. This method is part of the + // Visitor pattern. + + virtual ssize_t send_request (void) = 0; + // Send a <GET> command to fetch the contents in the URI from the + // server. + + virtual const ACE_URL_Addr &url_addr (void) const = 0; + // Returns the URL that we represent. + + virtual Mem_Map_Stream &stream (void); + // Returns the <Mem_Map_Stream>. + + // = Get/set the reply status. + virtual const URL_Status &reply_status (void); + virtual void reply_status (const URL_Status &); + + // = Get/set the reply status. + virtual const ACE_CString &content_type (void); + virtual void content_type (const ACE_CString &); + + + +private: + URL_Status reply_status_; + // Reply status of the URL. + + ACE_CString content_type_; + // Content-type of the URL. + + Mem_Map_Stream stream_; + // Contents of the stream. 
+}; + +#endif /* _URL_H */ diff --git a/ACE/examples/Web_Crawler/URL_Addr.cpp b/ACE/examples/Web_Crawler/URL_Addr.cpp new file mode 100644 index 00000000000..5a630e387fb --- /dev/null +++ b/ACE/examples/Web_Crawler/URL_Addr.cpp @@ -0,0 +1,234 @@ +// $Id$ + +#include "URL_Addr.h" +#include "ace/Log_Msg.h" +#include "ace/OS_NS_string.h" +#include "ace/OS_NS_stdio.h" +#include "ace/OS_NS_stdlib.h" +#include "ace/OS_Memory.h" + +ACE_RCSID (Web_Crawler, + URL_Addr, + "$Id$") + + +ACE_URL_Addr::ACE_URL_Addr (void) + : path_name_ (0), + addr_string_ (0), + addr_string_len_ (0) +{ +} + +int +ACE_URL_Addr::addr_to_string (ACE_TCHAR *s, + size_t size, + int ipaddr_format) const +{ + const size_t total_len = + ACE_OS::strlen (ipaddr_format == 0 ? + this->get_host_name () : + this->get_host_addr ()) + + ACE_OS::strlen ("65536") // Assume the max port number. + + ACE_OS::strlen (this->get_path_name ()) + + sizeof (':') + + sizeof ('/') + + sizeof ('\0'); // For trailing '\0'. + + if (size < total_len) + return -1; + else + { + ACE_OS::sprintf (s, ACE_TEXT ("%s:%d/%s"), + ACE_TEXT_CHAR_TO_TCHAR (ipaddr_format == 0 + ? this->get_host_name () + : this->get_host_addr ()), + this->get_port_number (), + this->get_path_name ()); + return 0; + } +} + +const ACE_TCHAR * +ACE_URL_Addr::addr_to_string (int ipaddr_format) const +{ + ACE_URL_Addr *this_ptr = const_cast<ACE_URL_Addr *> (this); + + size_t size = + ACE_OS::strlen (ipaddr_format == 0 ? + this->get_host_name () : + this->get_host_addr ()) + + ACE_OS::strlen ("65536") // Assume the max port number. + + ACE_OS::strlen (this->get_path_name ()) + + sizeof (':') + + sizeof ('/') + + sizeof ('\0'); // For trailing '\0'. 
+ + if (size > this->addr_string_len_) + { + ACE_ALLOCATOR_RETURN (this_ptr->addr_string_, + (ACE_TCHAR *) ACE_OS::realloc ((void *) this->addr_string_, + size), + 0); + this_ptr->addr_string_len_ = size; + } + ACE_OS::sprintf (this->addr_string_, + ACE_TEXT ("%s:%d/%s"), + ACE_TEXT_CHAR_TO_TCHAR (ipaddr_format == 0 + ? this->get_host_name () + : this->get_host_addr ()), + this->get_port_number (), + this->get_path_name ()); + return this->addr_string_; +} + +int +ACE_URL_Addr::string_to_addr (const ACE_TCHAR *s) +{ + int result; + ACE_TCHAR *t; + + // Need to make a duplicate since we'll be overwriting the string. + ACE_ALLOCATOR_RETURN (t, + ACE_OS::strdup (s), + -1); + + + // First split off the path_name. + + ACE_TCHAR *path_name = ACE_OS::strchr (t, ACE_TEXT ('/')); + const ACE_TCHAR *name = ACE_TEXT ("index.html"); + if (path_name != 0) + { + if (ACE_OS::strlen (path_name + 1) > 0) + name = path_name + 1; + + *path_name = '\0'; + } + + ACE_ALLOCATOR_RETURN (this->path_name_, + // Skip over '/' + ACE_OS::strdup (name), + -1); + + // Now handle the host address and port number. + ACE_TCHAR *port_number = ACE_OS::strchr (t, ':'); + + if (port_number == 0) + { + // Assume it's an ip-address or ip-number. 
+      result = this->ACE_INET_Addr::set (ACE_DEFAULT_HTTP_PORT,
+                                         t);
+    }
+  else
+    {
+      *port_number = '\0';
+      u_short port = (u_short) ACE_OS::atoi (port_number + 1); // Skip over ':'
+      result = this->ACE_INET_Addr::set (port, t);
+    }
+
+  ACE_OS::free (ACE_MALLOC_T (t));
+  return result;
+}
+
+ACE_URL_Addr::ACE_URL_Addr (const ACE_URL_Addr &addr)
+  : ACE_INET_Addr (),
+    path_name_ (0),
+    addr_string_ (0),
+    addr_string_len_ (0)
+{
+  if (this->set (addr) == -1)
+    ACE_ERROR ((LM_ERROR,
+                ACE_TEXT ("%p\n"),
+                ACE_TEXT ("ACE_URL_Addr::ACE_URL_Addr")));
+}
+
+int
+ACE_URL_Addr::set (const ACE_URL_Addr &addr)
+{
+  // Make this object a copy of <addr>.  Returns 0 on success and -1
+  // if the base-class address cannot be set or a string cannot be
+  // duplicated.
+
+  // Self-assignment: nothing to copy, and freeing our own strings
+  // first would leave nothing valid to duplicate from.
+  if (this == &addr)
+    return 0;
+
+  // Release the old strings and clear the members immediately, so
+  // that every early return below leaves pointers that are safe to
+  // pass to free() again.
+  ACE_OS::free (this->path_name_);
+  this->path_name_ = 0;
+  ACE_OS::free (this->addr_string_);
+  this->addr_string_ = 0;
+  this->addr_string_len_ = 0;
+
+  if (this->ACE_INET_Addr::set (addr) == -1)
+    return -1;
+
+  if (addr.path_name_)
+    ACE_ALLOCATOR_RETURN (this->path_name_,
+                          ACE_OS::strdup (addr.path_name_),
+                          -1);
+  if (addr.addr_string_)
+    {
+      ACE_ALLOCATOR_RETURN (this->addr_string_,
+                            ACE_OS::strdup (addr.addr_string_),
+                            -1);
+      // The duplicate is exactly as large as the string, so record
+      // that size instead of <addr>'s recorded length, which may
+      // describe a larger buffer.
+      this->addr_string_len_ =
+        ACE_OS::strlen (this->addr_string_) + 1;
+    }
+  return 0;
+}
+
+void
+ACE_URL_Addr::operator= (const ACE_URL_Addr &addr)
+{
+  // Assignment cannot report failure to the caller, so log it under
+  // this operator's own name.
+  if (this->set (addr) == -1)
+    ACE_ERROR ((LM_ERROR,
+                ACE_TEXT ("%p\n"),
+                ACE_TEXT ("ACE_URL_Addr::operator=")));
+}
+
+u_long
+ACE_URL_Addr::hash (void) const
+{
+  u_long result = this->ACE_INET_Addr::hash ()
+    + ACE::hash_pjw (this->get_path_name ());
+
+  return result;
+}
+
+bool
+ACE_URL_Addr::operator== (const ACE_URL_Addr &addr) const
+{
+  return ACE_OS::strcmp (addr.get_path_name (),
+                         this->get_path_name ()) == 0
+    && addr.get_port_number () == this->get_port_number ()
+    && addr.get_ip_address () == this->get_ip_address ();
+}
+
+bool
+ACE_URL_Addr::operator!= (const ACE_URL_Addr &addr) const
+{
+  return !(*this == addr);
+}
+
+ACE_URL_Addr::ACE_URL_Addr (const ACE_TCHAR *host_name,
+                            const ACE_TCHAR *path_name,
+                            u_short port)
+  : ACE_INET_Addr (port,
host_name), + path_name_ (ACE_OS::strdup (path_name)), + addr_string_ (0), + addr_string_len_ (0) +{ +} + +const ACE_TCHAR * +ACE_URL_Addr::get_path_name (void) const +{ + return this->path_name_; +} + +ACE_URL_Addr::~ACE_URL_Addr (void) +{ + ACE_OS::free (reinterpret_cast<void *> (const_cast<ACE_TCHAR *> + (this->path_name_))); + ACE_OS::free (reinterpret_cast<void *> (const_cast<ACE_TCHAR *> + (this->addr_string_))); + this->path_name_ = 0; +} + +int +ACE_URL_Addr::destroy (void) +{ + // Commit suicide. + delete this; + return 0; +} diff --git a/ACE/examples/Web_Crawler/URL_Addr.h b/ACE/examples/Web_Crawler/URL_Addr.h new file mode 100644 index 00000000000..9792e1bb390 --- /dev/null +++ b/ACE/examples/Web_Crawler/URL_Addr.h @@ -0,0 +1,111 @@ +// -*- C++ -*- + +// $Id$ + +// ============================================================================ +// +// = LIBRARY +// examples/Web_Crawler +// +// = FILENAME +// URL_Addr.h +// +// = AUTHOR +// Douglas C. Schmidt <schmidt@cs.wustl.edu> +// +// ============================================================================ + +#ifndef ACE_URL_ADDR_H +#define ACE_URL_ADDR_H + +#include "ace/INET_Addr.h" + +#if !defined (ACE_LACKS_PRAGMA_ONCE) +#pragma once +#endif /* ACE_LACKS_PRAGMA_ONCE */ + +#include "ace/ACE.h" + +class ACE_URL_Addr : public ACE_INET_Addr +{ + // = TITLE + // Defines a URL address family address format. +public: + // = Initialization and termination methods. + ACE_URL_Addr (void); + // Constructor. + + ACE_URL_Addr (const ACE_TCHAR *host_name, + const ACE_TCHAR *path_name, + u_short port = ACE_DEFAULT_HTTP_PORT); + + ACE_URL_Addr (const ACE_URL_Addr &addr); + // Copy constructor. + + int set (const ACE_URL_Addr &addr); + // Essentially the copy constructor. 
+
+  virtual int string_to_addr (const ACE_TCHAR *address);
+  // Initializes an <ACE_URL_Addr> from the <address>, which can be
+  // "ip-name:port-number/path-name" (e.g.,
+  // "www.cs.wustl.edu:1234/~schmidt/") or
+  // "ip-number:port-number/path-name" (e.g.,
+  // "128.252.166.57:1234/~schmidt").  If there is no ':' in the
+  // <address> it is assumed to be an ip-name or ip-number with the
+  // port number <ACE_DEFAULT_HTTP_PORT>.
+
+  virtual int addr_to_string (ACE_TCHAR *s,
+                              size_t size,
+                              int ipaddr_format = 1) const;
+  // Transform the current <ACE_INET_Addr> address into string format.
+  // If <ipaddr_format> is non-0 this produces
+  // "ip-number:port-number/path-name" (e.g.,
+  // "128.252.166.57:80/~schmidt/"), whereas if <ipaddr_format> is 0
+  // this produces "ip-name:port-number/path-name" (e.g.,
+  // "www.cs.wustl.edu:80/~schmidt/").  Returns -1 if the <size> of
+  // the buffer <s> is too small, else 0.
+
+  virtual const ACE_TCHAR *addr_to_string (int ipaddr_format = 1) const;
+  // Transform the current <ACE_INET_Addr> address into string format.
+  // If <ipaddr_format> is non-0 this produces
+  // "ip-number:port-number/path-name" (e.g.,
+  // "128.252.166.57:80/~schmidt/"), whereas if <ipaddr_format> is 0
+  // this produces "ip-name:port-number/path-name" (e.g.,
+  // "www.cs.wustl.edu:80/~schmidt/").  Uses dynamic memory, which
+  // is allocated on demand and deallocated when the object is
+  // destroyed.  Returns a pointer to the formatted string, or 0 if
+  // the dynamic memory allocation fails.
+
+  void operator= (const ACE_URL_Addr &addr);
+  // Assignment operator.
+
+  ~ACE_URL_Addr (void);
+  // Destructor.
+
+  bool operator == (const ACE_URL_Addr &SAP) const;
+  // Compare two addresses for equality.  The addresses are considered
+  // equal if they contain the same IP address, port number, and path
+  // name.
+
+  bool operator != (const ACE_URL_Addr &SAP) const;
+  // Compare two addresses for inequality.
+
+  virtual u_long hash (void) const;
+  // Computes and returns hash value.
+ + const ACE_TCHAR *get_path_name (void) const; + // Return the path name. + + int destroy (void); + // Commit suicide. +private: + ACE_TCHAR *path_name_; + // Our path name. + + ACE_TCHAR *addr_string_; + // The dynamically address string that's used for the + // <addr_to_string> method. + + size_t addr_string_len_; + // Current length of the <addr_string_> +}; + +#endif /* ACE_URL_ADDR_H */ diff --git a/ACE/examples/Web_Crawler/URL_Status.cpp b/ACE/examples/Web_Crawler/URL_Status.cpp new file mode 100644 index 00000000000..35a57420593 --- /dev/null +++ b/ACE/examples/Web_Crawler/URL_Status.cpp @@ -0,0 +1,40 @@ +/* -*- C++ -*- */ +// $Id$ + +#include "URL_Status.h" + +ACE_RCSID(Web_Crawler, URL_Status, "$Id$") + +URL_Status::URL_Status (STATUS_CODE code) + : status_ (code) +{ +} + +URL_Status::URL_Status (const URL_Status &s) + : status_ (s.status_) +{ +} + +URL_Status::STATUS_CODE +URL_Status::status (void) const +{ + return this->status_; +} + +void +URL_Status::status (int s) +{ + this->status_ = URL_Status::STATUS_CODE (s); +} + +void +URL_Status::status (URL_Status::STATUS_CODE s) +{ + this->status_ = s; +} + +int URL_Status::destroy (void) +{ + delete this; + return 0; +} diff --git a/ACE/examples/Web_Crawler/URL_Status.h b/ACE/examples/Web_Crawler/URL_Status.h new file mode 100644 index 00000000000..672c5e4f240 --- /dev/null +++ b/ACE/examples/Web_Crawler/URL_Status.h @@ -0,0 +1,61 @@ +/* -*- C++ -*- */ +// $Id$ + +// ============================================================================ +// +// = LIBRARY +// examples/Web_Crawler +// +// = FILENAME +// URL_Status.h +// +// = AUTHOR +// Douglas C. 
Schmidt <schmidt@cs.wustl.edu> +// +// ============================================================================ + +#ifndef _URL_STATUS_H +#define _URL_STATUS_H + +#include "ace/config-all.h" + +#if !defined (ACE_LACKS_PRAGMA_ONCE) +#pragma once +#endif /* ACE_LACKS_PRAGMA_ONCE */ + +class URL_Status +{ + // = TITLE +public: + enum STATUS_CODE + { + STATUS_OK = 200, + STATUS_CREATED = 201, + STATUS_ACCEPTED = 202, + STATUS_NO_CONTENT = 204, + STATUS_MOVED_PERMANENTLY = 301, + STATUS_MOVED_TEMPORARILY = 302, + STATUS_NOT_MODIFIED = 304, + STATUS_BAD_REQUEST = 400, + STATUS_UNAUTHORIZED = 401, + STATUS_FORBIDDEN = 403, + STATUS_ITEM_NOT_FOUND = 404, + STATUS_INTERNAL_SERVER_ERROR = 500, + STATUS_OP_NOT_IMPLEMENTED = 501, + STATUS_BAD_GATEWAY = 502, + STATUS_SERVICE_UNAVAILABLE = 503, + STATUS_INSUFFICIENT_DATA = 399 + }; + + URL_Status (STATUS_CODE = STATUS_INSUFFICIENT_DATA); + URL_Status (const URL_Status &); + + STATUS_CODE status (void) const; + void status (int); + void status (STATUS_CODE); + int destroy (void); +private: + STATUS_CODE status_; +}; + +#endif /* _URL_STATUS_H */ diff --git a/ACE/examples/Web_Crawler/URL_Visitor.cpp b/ACE/examples/Web_Crawler/URL_Visitor.cpp new file mode 100644 index 00000000000..481a7140089 --- /dev/null +++ b/ACE/examples/Web_Crawler/URL_Visitor.cpp @@ -0,0 +1,543 @@ +// $Id$ + +#include "ace/OS_NS_string.h" +#include "URL_Visitor.h" +#include "Command_Processor.h" + +ACE_RCSID(Web_Crawler, URL_Visitor, "$Id$") + +URL_Processing_Strategy::URL_Processing_Strategy (URL &url, + URL_Iterator &iterator) + : url_ (url), + iterator_ (iterator) +{ +} + +URL_Processing_Strategy::~URL_Processing_Strategy (void) +{ +} + +int +URL_Processing_Strategy::destroy (void) +{ + // Commit suicide. 
+  delete this;
+  return 0;
+}
+
+URL_Download_Strategy::URL_Download_Strategy (URL &url,
+                                              URL_Iterator &iterator)
+  : URL_Processing_Strategy (url, iterator)
+{
+}
+
+int
+URL_Download_Strategy::execute (void)
+{
+  ACE_CString buffer;
+
+  // Extract all the contents of the Stream and print them to the
+  // file.
+  while (this->iterator_.next (buffer) != 0)
+    ACE_DEBUG ((LM_DEBUG,
+                "%s",
+                buffer.c_str ()));
+
+  return 0;
+}
+
+HTTP_Header_Processing_Strategy::HTTP_Header_Processing_Strategy (URL &url,
+                                                                  URL_Iterator &iterator)
+  : URL_Processing_Strategy (url, iterator)
+{
+}
+
+int
+HTTP_Header_Processing_Strategy::execute (void)
+{
+  // Walks the header lines delivered by the iterator: the first line
+  // supplies the reply status, later lines are scanned for the
+  // content type.  Returns -1 on failure or a non-200 status, else 0.
+
+  // Set the get() position.Necessary since later a peek is done.
+  if (this->url_.stream ().get_char () == 0)
+    ACE_ERROR_RETURN ((LM_ERROR,
+                       "%p\n","Header Not Found"),
+                      -1);
+  // Start from an empty string instead of seeding it from an
+  // uninitialized stack buffer; <line> is handed to next() before it
+  // is first read.
+  ACE_CString line;
+  // Get the lines in the header iteratively and check for status info.
+  int result = 1, i = 0;
+  for (i = 0, result = this->iterator_.next (line);
+       result > 0;
+       ++i, result = this->iterator_.next (line))
+    {
+      if (i == 0)
+        {
+          // Assuming that the status-no is a space away.
+          int status_index = line.find ("HTTP", 0);
+          ACE_CString status = line.substring (status_index + 9, //HTTP/1.1 200
+                                               3);
+
+          // Report allocation failure as an error (-1); returning 0
+          // here would be read by the caller as success.
+          URL_Status *url_status = 0;
+          ACE_NEW_RETURN (url_status,
+                          URL_Status,
+                          -1);
+          Auto_Destroyer<URL_Status> url_status_ptr (url_status);
+          url_status_ptr->status (ACE_OS::atoi (status.c_str ()));
+          this->url_.reply_status (**url_status_ptr);
+          // Invalid url.
+          if (url_status_ptr->status () != 200)
+            return -1;
+        }
+      else
+        {
+
+          if (line.find ("text/html") != ACE_CString::npos)
+            {
+              ACE_CString url_content_type("text/html");
+              this->url_.content_type (url_content_type);
+            }
+        }
+    }
+  return 0;
+
+}
+
+HTML_Body_Validation_Strategy::HTML_Body_Validation_Strategy (URL &url,
+                                                              URL_Iterator &iterator,
+                                                              URL_Validation_Visitor &context)
+  : URL_Processing_Strategy (url, iterator),
+    visitor_context_ (context)
+{
+}
+
+int
+HTML_Body_Validation_Strategy::execute (void)
+{
+  // Turns every link delivered by the iterator into a "host/path"
+  // string, resolving relative links against the directory of the
+  // current URL.
+
+  // The strings below start out empty; they used to be constructed
+  // from uninitialized stack buffers.  Each one is given its value by
+  // set() or next() before it is read.
+  ACE_CString host_name;
+  host_name.set (url_.url_addr ().get_host_name (),1);
+
+  // All to facilitate relative paths
+  ACE_CString prev_location;
+
+  prev_location.set (ACE_TEXT_ALWAYS_CHAR (this->url_.url_addr ().get_path_name ()),
+                     ACE_OS::strlen (this->url_.url_addr ().get_path_name ()),
+                     1);
+  int index = prev_location.rfind ('/', prev_location.length ());
+  ACE_CString str = prev_location.substring (0, index + 1);
+  prev_location.set (str.c_str (), 1);
+
+  // Note: prev_location always ends with '/'
+  if (prev_location[0] != '/')
+    prev_location = "/" + prev_location;
+
+  // Build the url portion which can be attached to the relative paths.
+  prev_location = host_name + prev_location;
+
+  ACE_CString url;
+
+  while (this->iterator_.next (url) > 0)
+    {
+      // Check for relative urls.Strip out "http://" if its there.
+      if (url.find ("http") == url.npos)
+        {
+          // "../x": drop the leading "../" and step <prev_location> up
+          // one directory.
+          if (url[0] == '.' && url[1] == '.')
+            {
+              url.set (&url[3], 1);
+              int i = prev_location.rfind ('/', prev_location.length () - 1);
+              prev_location = prev_location.substring (0, i+1);
+            }
+          // "./x": drop the leading "./".
+          if (url[0] == '.' && url[1] == '/')
+            url.set (&url[2], 1);
+
+          url = prev_location + url;
+        }
+      else
+        // Skip the first seven characters (the length of "http://").
+        url.set (&url[7], 1);
+      // Double slash at the end works!e.g www.cs.wustl.edu/~kirthika//
+      if (url.find (".html") == url.npos)
+        url = url + "/";
+
+      // Create the new URL address.
+ ACE_URL_Addr *url_addr; + ACE_NEW_RETURN (url_addr, + ACE_URL_Addr, + 0); + Auto_Destroyer<ACE_URL_Addr> url_addr_ptr (url_addr); + if (url_addr_ptr->string_to_addr (ACE_TEXT_CHAR_TO_TCHAR (url.c_str ())) == 0) + { + HTTP_URL *http_url; + ACE_NEW_RETURN (http_url, + HTTP_URL (**url_addr_ptr, + dynamic_cast<HTTP_URL *> (&this->url_)), + 0); + URL_Command *url_command; + ACE_NEW_RETURN (url_command, + URL_Command (http_url), + 0); + + OPTIONS::instance ()->command_processor ()->insert (url_command); + } + } + return 0; +} + +URL_Iterator * +URL_Validation_Visitation_Strategy_Factory::make_header_iterator (void) +{ + URL_Iterator *i; + ACE_NEW_RETURN (i, + HTTP_Header_Iterator (*this->url_), + 0); + return i; +} + +URL_Iterator * +URL_Validation_Visitation_Strategy_Factory::make_body_iterator (void) +{ + URL_Iterator *i; + ACE_NEW_RETURN (i, + HTML_Body_Iterator (*this->url_), + 0); + return i; +} + +URL_Processing_Strategy * +URL_Validation_Visitation_Strategy_Factory::make_header_strategy (URL_Iterator &iterator) +{ + URL_Processing_Strategy *ps; + ACE_NEW_RETURN (ps, + HTTP_Header_Processing_Strategy (*this->url_, + iterator), + 0); + return ps; +} + +URL_Processing_Strategy * +URL_Validation_Visitation_Strategy_Factory::make_body_strategy (URL_Iterator &iterator) +{ + URL_Processing_Strategy *ps; + ACE_NEW_RETURN (ps, + HTML_Body_Validation_Strategy (*this->url_, + iterator, + this->visitor_context_), + 0); + return ps; +} + +int +URL_Validation_Visitation_Strategy_Factory::destroy (void) +{ + // Commit suicide. 
+  delete this;
+  return 0;
+}
+
+URL_Visitor::~URL_Visitor (void)
+{
+}
+
+URL_Validation_Visitor::URL_Validation_Visitor (void)
+{
+  // Null both pointers before allocating: ACE_NEW returns from the
+  // constructor when an allocation fails, and the members must not be
+  // left indeterminate in that case.
+  this->caching_connect_strategy_ = 0;
+  this->strat_connector_ = 0;
+
+  ACE_NEW (this->caching_connect_strategy_,
+           CACHED_CONNECT_STRATEGY (this->caching_strategy_));
+  ACE_NEW (this->strat_connector_,
+           STRATEGY_CONNECTOR(0,
+                              &creation_strategy_,
+                              caching_connect_strategy_,
+                              &activation_strategy_));
+  // "%p" consumes the message argument; the original format was
+  // missing the comma and also named a "%s" with no argument.
+  if (strat_connector_ == 0)
+    ACE_ERROR ((LM_ERROR,
+                "%p\n",
+                "strategy connector creation failed"));
+
+
+}
+
+URL_Validation_Visitor::~URL_Validation_Visitor (void)
+{
+  // NOTE(review): the connector is only forgotten here, not deleted;
+  // confirm every owner goes through destroy() or it will leak.
+  this->strat_connector_ = 0;
+  if (this->caching_connect_strategy_ != 0)
+    delete this->caching_connect_strategy_;
+}
+
+URL_Validation_Visitor::URL_CACHE &
+URL_Validation_Visitor::url_cache (void)
+{
+  return this->url_cache_;
+}
+
+int
+URL_Validation_Visitor::in_cache (const ACE_URL_Addr &url_addr)
+{
+  // Returns 0 if <url_addr> is not cached, 1 if it is cached with
+  // status 200, and -1 if it is cached with any other status.
+  URL_Status reply_status (URL_Status::STATUS_CODE (1));
+
+  if (this->url_cache_.find (url_addr, reply_status) == 0)
+    {
+      ACE_DEBUG ((LM_DEBUG,
+                  "status %d for URL %s (cached)\n",
+                  reply_status.status (),
+                  url_addr.addr_to_string (0)));
+
+      // Invalid status.
+      if (reply_status.status () != 200)
+        return -1;
+
+      return 1;
+    }
+  else
+    return 0;
+}
+
+URL_Visitation_Strategy_Factory *
+URL_Validation_Visitor::make_visitation_strategy_factory (URL &url)
+{
+  // Since this is HTTP 1.1 we'll need to establish a connection
+  // only once. Trying for relative paths.
+
+  if (url.stream ().open (this->strat_connector_,
+                          url.url_addr ()) == -1)
+    return 0;
+
+  // See if we can get connected and send the GET request via the
+  // <HTTP_URL>.
+  int result = url.send_request ();
+  if (result == -1)
+    {
+      ACE_ERROR ((LM_ERROR,
+                  "%p\n",
+                  "send_request"));
+      if (this->url_cache_.bind (url.url_addr (),
+                                 URL_Status (URL_Status::STATUS_SERVICE_UNAVAILABLE)) == -1)
+        ACE_ERROR ((LM_ERROR,
+                    "%p\n",
+                    "bind"));
+      return 0;
+    }
+  // @@ Here's where we could check to see if the <url> was HTTP or
+  // FTP, etc.
But for now we'll just assume that everything is an + // HTTP URL. + else + { + + URL_Visitation_Strategy_Factory *vs; + ACE_NEW_RETURN (vs, + URL_Validation_Visitation_Strategy_Factory (&url, + *this), + 0); + return vs; + } +} + +int +URL_Validation_Visitor::destroy (void) +{ + delete this->strat_connector_; + // Commit suicide. + delete this; + return 0; +} + +int +URL_Validation_Visitor::visit (HTTP_URL &http_url) +{ + int result = this->in_cache (http_url.url_addr ()); + if (result == 0) + { + Auto_Destroyer <URL_Visitation_Strategy_Factory> vs (this->make_visitation_strategy_factory (http_url)); + + if (*vs == 0) + ACE_ERROR_RETURN ((LM_ERROR, + "%p\n", + "make_visitation_strategy_factory"), + -1); + + Auto_Destroyer <URL_Iterator> ihs (vs->make_header_iterator ()); + if (*ihs == 0) + ACE_ERROR_RETURN ((LM_ERROR, + "%p\n", + "make_header_iterator"), + -1); + Auto_Destroyer <URL_Processing_Strategy> phs (vs->make_header_strategy (**ihs)); + if (*phs == 0) + ACE_ERROR_RETURN ((LM_ERROR, + "%p\n", + "make_header_strategy"), + -1); + int phs_result = phs->execute (); + if (phs_result == -1) + ACE_DEBUG ((LM_DEBUG, + "Invalid ")); + + ACE_DEBUG ((LM_DEBUG, + "URL with status %d %s\n", + http_url.reply_status ().status (), + http_url.url_addr().addr_to_string (0))); + + // Store the http url in the cache. + if (this->url_cache ().bind (http_url.url_addr (), + http_url.reply_status ()) != 0) + ACE_ERROR_RETURN ((LM_ERROR, + "%p\n","url_cache.bind"), + -1); + + // Since it is invalid dont go further. + if (phs_result == -1) + return 0; + + // Get back if the recurse option isnt set. 
+ if (OPTIONS::instance ()->recurse () != 1) + return 0; + + Auto_Destroyer <URL_Iterator> is (vs->make_body_iterator ()); + if (*is == 0) + ACE_ERROR_RETURN ((LM_ERROR, + "%p\n", + "make_body_iterator"), + -1); + + Auto_Destroyer <URL_Processing_Strategy> ps (vs->make_body_strategy (**is)); + if (*ps == 0) + ACE_ERROR_RETURN ((LM_ERROR, + "%p\n", + "make_body_strategy"), + -1); + + if (ps->execute () == -1) + ACE_ERROR_RETURN ((LM_ERROR, + "%p\n", + "body execute"), + -1); + + } + return 0; +} + +int +URL_Download_Visitation_Strategy_Factory::destroy (void) +{ + // Commit suicide. + delete this; + return 0; +} + +URL_Iterator * +URL_Download_Visitation_Strategy_Factory::make_header_iterator (void) +{ + return 0; +} + +URL_Iterator * +URL_Download_Visitation_Strategy_Factory::make_body_iterator (void) +{ + URL_Iterator *i; + ACE_NEW_RETURN (i, + URL_Download_Iterator (*this->url_), + 0); + return i; +} + +URL_Processing_Strategy * +URL_Download_Visitation_Strategy_Factory::make_header_strategy (URL_Iterator &iterator) +{ + // You fill in here. 
+ ACE_UNUSED_ARG (iterator); + + return 0; +} + +URL_Processing_Strategy * +URL_Download_Visitation_Strategy_Factory::make_body_strategy (URL_Iterator &iterator) +{ + URL_Processing_Strategy *ps; + ACE_NEW_RETURN (ps, + URL_Download_Strategy (*this->url_, + iterator), + 0); + return ps; +} + +URL_Visitation_Strategy_Factory::URL_Visitation_Strategy_Factory (URL *url) + : url_ (url) +{ +} + +URL_Visitation_Strategy_Factory::~URL_Visitation_Strategy_Factory (void) +{ +} + +URL_Download_Visitation_Strategy_Factory::URL_Download_Visitation_Strategy_Factory (URL *url) + : URL_Visitation_Strategy_Factory (url) +{ +} + +URL_Validation_Visitation_Strategy_Factory::URL_Validation_Visitation_Strategy_Factory (URL *url, + URL_Validation_Visitor &visitor_context) + : URL_Visitation_Strategy_Factory (url), + visitor_context_ (visitor_context) +{ +} + +URL_Visitation_Strategy_Factory * +URL_Download_Visitor::make_visitation_strategy_factory (URL &url) +{ + // See if we can get connected and send the GET request via the + // <HTTP_URL>. + while (1) + { + int retval = url.send_request (); + if (retval != -1) + break; + + } + // @@ Here's where we could check to see if the <url> was HTTP or + // FTP, etc. But for now we'll just assume that everything is an + // HTTP URL. + URL_Visitation_Strategy_Factory *vs; + ACE_NEW_RETURN (vs, + URL_Download_Visitation_Strategy_Factory (&url), + 0); + return vs; + +} + +int +URL_Download_Visitor::destroy (void) +{ + // Commit suicide. 
+ delete this; + return 0; +} + +int +URL_Download_Visitor::visit (HTTP_URL &http_url) +{ + Auto_Destroyer <URL_Visitation_Strategy_Factory> vs (this->make_visitation_strategy_factory (http_url)); + + if (*vs == 0) + ACE_ERROR_RETURN ((LM_ERROR, + "%p\n", + "make_visitation_strategy_factory"), + -1); + + Auto_Destroyer <URL_Iterator> is (vs->make_body_iterator ()); + if (*is == 0) + ACE_ERROR_RETURN ((LM_ERROR, + "%p\n", + "make_body_iterator"), + -1); + + Auto_Destroyer <URL_Processing_Strategy> ps (vs->make_body_strategy (**is)); + if (*ps == 0) + ACE_ERROR_RETURN ((LM_ERROR, + "%p\n", + "make_body_strategy"), + -1); + + if (ps->execute () == -1) + ACE_ERROR_RETURN ((LM_ERROR, + "%p\n", + "body execute"), + -1); + return 0; +} diff --git a/ACE/examples/Web_Crawler/URL_Visitor.h b/ACE/examples/Web_Crawler/URL_Visitor.h new file mode 100644 index 00000000000..9f68612d629 --- /dev/null +++ b/ACE/examples/Web_Crawler/URL_Visitor.h @@ -0,0 +1,436 @@ +/* -*- C++ -*- */ +// $Id$ + +// ============================================================================ +// +// = LIBRARY +// examples/Web_Crawler +// +// = FILENAME +// URL_Visitor.h +// +// = AUTHOR +// Douglas C.Schmidt <schmidt@cs.wustl.edu> +// Kirthika Parameswaran <kirthika@cs.wustl.edu> +// ============================================================================ + +#ifndef _URL_VISITOR_H +#define _URL_VISITOR_H +#include /**/ "ace/pre.h" + +#include "ace/Strategies_T.h" + +#if !defined (ACE_LACKS_PRAGMA_ONCE) +#pragma once +#endif /* ACE_LACKS_PRAGMA_ONCE */ + + +#include "HTTP_URL.h" +#include "Iterators.h" +#include "ace/Hash_Map_Manager_T.h" +#include "ace/Caching_Strategies_T.h" +#include "ace/Cached_Connect_Strategy_T.h" +#include "Options.h" +#include "ace/Pair_T.h" + +// Forward declarations. +class URL_Validation_Visitor; + +class URL_Processing_Strategy +{ + // = TITLE + // Abstract base class for the URL processing strategy. 
+ // + // = DESCRIPTION +public: + URL_Processing_Strategy (URL &, + URL_Iterator &); + // Constructor. + + virtual ~URL_Processing_Strategy (void); + + virtual int execute (void) = 0; + // Perform the strategy. + + virtual int destroy (void); + + // Close down the resources. + +protected: + URL &url_; + // A reference to the URL "context" that we're processing. + + URL_Iterator &iterator_; + // Iterator for the URL that we're processing. +}; + +class HTTP_Header_Processing_Strategy : public URL_Processing_Strategy +{ + // = TITLE + // Defines the HTTP header processing strategy. + // + // = DESCRIPTION +public: + HTTP_Header_Processing_Strategy (URL &, + URL_Iterator &); + // Constructor. + + virtual int execute (void); + // Perform the strategy for processing an HTTP header. +}; + +class HTML_Body_Validation_Strategy : public URL_Processing_Strategy +{ + // = TITLE + // Defines the HTML body processing strategy. + // + // = DESCRIPTION + // This class iterates through the body of an HTML file and + // recursively visits embedded links. +public: + HTML_Body_Validation_Strategy (URL &, + URL_Iterator &, + URL_Validation_Visitor &); + // Constructor. + + virtual int execute (void); + // Perform the strategy for processing an HTML file. This strategy + // iterates over the HTML file and recursively visits embedded links + // to process them, as well. + +private: + URL_Validation_Visitor &visitor_context_; + // This is the context of the visit. +}; + +class URL_Download_Strategy : public URL_Processing_Strategy +{ + // = TITLE + // Defines a URL downloading strategy. + // + // = DESCRIPTION + // This class downloads a URL's contents into a temporary file. +public: + URL_Download_Strategy (URL &, + URL_Iterator &); + // Constructor. + + virtual int execute (void); + // Perform the strategy for downloading a URL to a temporary file. +}; + +class URL_Visitation_Strategy_Factory +{ + // = TITLE + // Abstract Factory for the URL visitation strategy. 
+ // + // = DESCRIPTION +public: + URL_Visitation_Strategy_Factory (URL *); + + /// Destructor. + virtual ~URL_Visitation_Strategy_Factory (void); + + // = Factory Methods. + virtual URL_Iterator *make_header_iterator (void) = 0; + // Factory Method that makes the header iterator. + + virtual URL_Iterator *make_body_iterator (void) = 0; + // Factory Method that makes the body iterator. + + virtual URL_Processing_Strategy *make_header_strategy (URL_Iterator &) = 0; + // Factory Method that makes the header processing strategy. + + virtual URL_Processing_Strategy *make_body_strategy (URL_Iterator &) = 0; + // Factory Method that makes the body processing strategy . + + virtual int destroy (void) = 0; + // Close down the resources. + +protected: + URL *url_; + // Stash the URL so we don't have to pass it around. +}; + +class URL_Download_Visitation_Strategy_Factory : public URL_Visitation_Strategy_Factory +{ + // = TITLE + // Concrete Factory for the URL validation visitation strategy. + // + // = DESCRIPTION +public: + URL_Download_Visitation_Strategy_Factory (URL *); + // Constructor. + + // = Factory Methods. + virtual URL_Iterator *make_header_iterator (void); + // Factory Method that makes an <HTTP_Header_Iterator>. + + virtual URL_Iterator *make_body_iterator (void); + // Factory Method that makes an <HTML_Body_Iterator>. + + virtual URL_Processing_Strategy *make_header_strategy (URL_Iterator &); + // Factory Method that makes the header processing strategy. + + virtual URL_Processing_Strategy *make_body_strategy (URL_Iterator &); + // Factory Method that makes the body processing strategy . + + virtual int destroy (void); + // Close down the resources. +}; + +class URL_Validation_Visitation_Strategy_Factory : public URL_Visitation_Strategy_Factory +{ + // = TITLE + // Concrete Factory for the URL validation visitation strategy. + // + // = DESCRIPTION +public: + URL_Validation_Visitation_Strategy_Factory (URL *, + URL_Validation_Visitor &); + // Constructor. 
+ + // = Factory Methods. + virtual URL_Iterator *make_header_iterator (void); + // Factory Method that makes an <HTTP_Header_Iterator>. + + virtual URL_Iterator *make_body_iterator (void); + // Factory Method that makes an <HTML_Body_Iterator>. + + virtual URL_Processing_Strategy *make_header_strategy (URL_Iterator &); + // Factory Method that makes the header processing strategy. + + virtual URL_Processing_Strategy *make_body_strategy (URL_Iterator &); + // Factory Method that makes the body processing strategy . + + virtual int destroy (void); + // Close down the resources. + +private: + URL_Validation_Visitor &visitor_context_; + // Context of the visitor. +}; + +class URL_Visitor +{ + // = TITLE + // Base class for the URL Visitor. + // + // = DESCRIPTION + // This class plays the "visitor" role in the Visitor pattern. +public: + + virtual ~URL_Visitor (void); + + virtual int visit (HTTP_URL &http_url) = 0; + // Visit an <HTTP_URL>. + + // @@ + // virtual int visit (FTP_URL &http_url) = 0; + + virtual int destroy (void) = 0; + // Cleanup the resources. + +protected: + virtual URL_Visitation_Strategy_Factory *make_visitation_strategy_factory (URL &) = 0; + // Make the appropriate <URL_Visitation_Strategy_Factory>. 
+}; + +typedef int ATTRIBUTES; +typedef ACE_Svc_Handler <ACE_SOCK_STREAM, ACE_NULL_SYNCH> + Client_Svc_Handler; +typedef ACE_Pair<Client_Svc_Handler *, ATTRIBUTES> + CACHED_HANDLER; +typedef ACE_Refcounted_Hash_Recyclable<ACE_INET_Addr> + ACE_ADDR; +typedef ACE_Hash<ACE_ADDR> H_KEY; +typedef ACE_Equal_To<ACE_ADDR> C_KEYS; + +typedef ACE_Hash_Map_Manager_Ex<ACE_ADDR, CACHED_HANDLER, H_KEY, C_KEYS, ACE_Null_Mutex> + HASH_MAP; +typedef ACE_Hash_Map_Iterator_Ex<ACE_ADDR, CACHED_HANDLER, H_KEY, C_KEYS, ACE_Null_Mutex> + HASH_MAP_ITERATOR; +typedef ACE_Hash_Map_Reverse_Iterator_Ex<ACE_ADDR, CACHED_HANDLER, H_KEY, C_KEYS, ACE_Null_Mutex> + HASH_MAP_REVERSE_ITERATOR; + +typedef ACE_Recyclable_Handler_Cleanup_Strategy<ACE_ADDR, CACHED_HANDLER, HASH_MAP> + CLEANUP_STRATEGY; +typedef ACE_Recyclable_Handler_Caching_Utility<ACE_ADDR, CACHED_HANDLER, HASH_MAP, HASH_MAP_ITERATOR, ATTRIBUTES> + CACHING_UTILITY; + +typedef ACE_LRU_Caching_Strategy<ATTRIBUTES, CACHING_UTILITY> + LRU_CACHING_STRATEGY; + +typedef LRU_CACHING_STRATEGY + CACHING_STRATEGY; + +typedef ACE_Strategy_Connector<Client_Svc_Handler, ACE_SOCK_CONNECTOR> + STRATEGY_CONNECTOR; + +typedef ACE_NOOP_Creation_Strategy<Client_Svc_Handler> + NULL_CREATION_STRATEGY; + +typedef ACE_NOOP_Concurrency_Strategy<Client_Svc_Handler> + NULL_ACTIVATION_STRATEGY; + +typedef ACE_Cached_Connect_Strategy_Ex<Client_Svc_Handler, ACE_SOCK_CONNECTOR, CACHING_STRATEGY, ATTRIBUTES, ACE_SYNCH_NULL_MUTEX> + CACHED_CONNECT_STRATEGY; + +class URL_Validation_Visitor : public URL_Visitor +{ + // = TITLE + // Subclass that defines the URL validation visitor. + // + // = DESCRIPTION + // This class checks to make sure that the <HTTP_URL> is valid. + // If the <HTTP_URL> is an <HTML> file, it can also be used to + // recursively check that all embedded links in this file are + // valid. 
+public: + typedef ACE_Hash_Map_Manager <ACE_URL_Addr, URL_Status, ACE_Null_Mutex> + URL_CACHE; + + virtual int visit (HTTP_URL &http_url); + // Visit an <HTTP_URL> to make sure that it's valid. If the content + // type of the <HTTP_URL> is "text/html" and the <recursion> option + // is enabled then <visit> recursively checks each link embedded in + // the HTML page. + + // @@ + // virtual int visit (FTP_URL &http_url); + + URL_Validation_Visitor (void); + virtual int destroy (void); + // Cleanup the resources. + + URL_CACHE &url_cache (void); + // Returns a reference to the URL cache. + + + /* + + + typedef ACE_Svc_Handler<ACE_SOCK_STREAM, ACE_NULL_SYNCH> + Svc_Handler; + typedef ACE_Strategy_Connector<Svc_Handler, ACE_SOCK_CONNECTOR> + STRAT_CONNECTOR; + typedef ACE_Refcounted_Hash_Recyclable<ACE_INET_Addr> + REFCOUNTED_HASH_RECYCLABLE_ADDRESS; + typedef ACE_NOOP_Creation_Strategy<Svc_Handler> + NULL_CREATION_STRATEGY; + typedef ACE_NOOP_Concurrency_Strategy<Svc_Handler> + NULL_ACTIVATION_STRATEGY; + + typedef ACE_Hash_Map_Manager_Ex<REFCOUNTED_HASH_RECYCLABLE_ADDRESS,\ + ACE_Pair<Svc_Handler *, int>,\ + ACE_Hash<REFCOUNTED_HASH_RECYCLABLE_ADDRESS>, \ + ACE_Equal_To<REFCOUNTED_HASH_RECYCLABLE_ADDRESS>,\ + ACE_Null_Mutex> + CONNECTION_HASH_MAP; + typedef ACE_Hash_Map_Iterator_Ex<REFCOUNTED_HASH_RECYCLABLE_ADDRESS,\ + ACE_Pair<Svc_Handler *, int>,\ + ACE_Hash<REFCOUNTED_HASH_RECYCLABLE_ADDRESS>, \ + ACE_Equal_To<REFCOUNTED_HASH_RECYCLABLE_ADDRESS>,\ + ACE_Null_Mutex> + CONNECTION_HASH_MAP_ITERATOR; + typedef ACE_Hash_Map_Reverse_Iterator_Ex<REFCOUNTED_HASH_RECYCLABLE_ADDRESS,\ + ACE_Pair<Svc_Handler *, int>,\ + ACE_Hash<REFCOUNTED_HASH_RECYCLABLE_ADDRESS>, \ + ACE_Equal_To<REFCOUNTED_HASH_RECYCLABLE_ADDRESS>,\ + ACE_Null_Mutex> + CONNECTION_HASH_MAP_REVERSE_ITERATOR; + typedef ACE_Pair_Caching_Utility <REFCOUNTED_HASH_RECYCLABLE_ADDRESS, \ + ACE_Pair<Svc_Handler *, int>, \ + CONNECTION_HASH_MAP, CONNECTION_HASH_MAP_ITERATOR, int > + CACHING_STRATEGY_UTILITY; + 
typedef ACE_LRU_Caching_Strategy<REFCOUNTED_HASH_RECYCLABLE_ADDRESS,\ + ACE_Pair<Svc_Handler *, int>,\ + CONNECTION_HASH_MAP, int,\ + CACHING_STRATEGY_UTILITY > + LRU; + typedef ACE_Cached_Connect_Strategy_Ex<Svc_Handler,ACE_SOCK_CONNECTOR, LRU,int, ACE_SYNCH_NULL_MUTEX> + CACHED_CONNECT_STRATEGY; + */ +protected: + virtual ~URL_Validation_Visitor (void); + virtual URL_Visitation_Strategy_Factory *make_visitation_strategy_factory (URL &); + // Factory Method that makes a + // <URL_Validation_Visitation_Strategy_Factory>. + + URL_CACHE url_cache_; + // Cache the status of URLs we've already validated. + + int in_cache (const ACE_URL_Addr &url_addr); + // Check to see if the reply status of this <url_addr> is in the + // cache. Returns 1 if so, 0 if not. + + NULL_CREATION_STRATEGY creation_strategy_; + NULL_ACTIVATION_STRATEGY activation_strategy_; + + // Configure the Strategy Connector with a strategy that caches + // connection. + CACHED_CONNECT_STRATEGY *caching_connect_strategy_; + + STRATEGY_CONNECTOR *strat_connector_; + + CACHING_STRATEGY caching_strategy_; +}; + + +class URL_Download_Visitor : public URL_Visitor +{ + // = TITLE + // Subclass for the URL validtion visitor. + // + // = DESCRIPTION + // This class checks to make sure that the <HTTP_URL> is valid. +public: + virtual int visit (HTTP_URL &http_url); + // Visit an <HTTP_URL> to make sure that it's valid. If the content + // type of the <HTTP_URL> is "text/html" and the <recursion> option + // is enabled then <visit> recursively checks each link embedded in + // the HTML page. + + // @@ + // virtual int visit (FTP_URL &http_url); + + virtual int destroy (void); + // Cleanup the resources. + +protected: + URL_Visitation_Strategy_Factory *make_visitation_strategy_factory (URL &); + // Factory Method that makes a <URL_Download_Visitation_Strategy_Factory>. 
+}; + +template <class T> +class Auto_Destroyer +{ + // = TITLE + // Simple class that ensures the <destroy> method is called on our + // <URL_*> objects when they go out of scope. + // + // = DESCRIPTION + // This class is similar to an auto_ptr<> and should be used to + // simplify blocks of code that must create/destroy pointers to + // various <URL_*> related strategies and iterators. +public: + Auto_Destroyer (T *t): t_ (t) {} + T *operator-> (void) { return this->t_; } + T *operator *(void) { return this->t_; } + void operator= (T *t) + { + if (this->t_ != 0) + this->t_->destroy (); + this->t_ = t; + } + ~Auto_Destroyer (void) + { + if (this->t_ != 0) + t_->destroy (); + } +private: + T *t_; +}; + +#include /**/ "ace/post.h" +#endif /* _URL_VISITOR_H */ diff --git a/ACE/examples/Web_Crawler/URL_Visitor_Factory.cpp b/ACE/examples/Web_Crawler/URL_Visitor_Factory.cpp new file mode 100644 index 00000000000..1b8a316b219 --- /dev/null +++ b/ACE/examples/Web_Crawler/URL_Visitor_Factory.cpp @@ -0,0 +1,53 @@ +/* -*- C++ -*- */ +// $Id$ + +#include "URL_Visitor_Factory.h" + +ACE_RCSID (Web_Crawler, + URL_Visitor_Factory, + "$Id$") + + +URL_Visitor_Factory::~URL_Visitor_Factory (void) +{ +} + +URL_Visitor * +URL_Validation_Visitor_Factory::make_visitor (void) +{ + URL_Visitor *v; + + ACE_NEW_RETURN (v, + URL_Validation_Visitor, + 0); + + return v; +} + +Command_Processor * +URL_Validation_Visitor_Factory::make_command_processor (void) +{ + Command_Processor *cp; + + ACE_NEW_RETURN (cp, + Command_Processor, + 0); + return cp; +} + +URL_Visitor * +URL_Download_Visitor_Factory::make_visitor (void) +{ + URL_Visitor *v; + + ACE_NEW_RETURN (v, + URL_Download_Visitor, + 0); + return v; +} + +Command_Processor * +URL_Download_Visitor_Factory::make_command_processor (void) +{ + return 0; +} diff --git a/ACE/examples/Web_Crawler/URL_Visitor_Factory.h b/ACE/examples/Web_Crawler/URL_Visitor_Factory.h new file mode 100644 index 00000000000..9f484afe9f0 --- /dev/null +++ 
b/ACE/examples/Web_Crawler/URL_Visitor_Factory.h @@ -0,0 +1,74 @@ +/* -*- C++ -*- */ +// $Id$ + +// ============================================================================ +// +// = LIBRARY +// examples/Web_Crawler +// +// = FILENAME +// URL_Visitor_Factory.h +// +// = AUTHOR +// Douglas C. Schmidt <schmidt@cs.wustl.edu> +// +// ============================================================================ + +#ifndef _URL_VISITOR_FACTORY_H +#define _URL_VISITOR_FACTORY_H + +#include "URL_Visitor.h" +#include "Command_Processor.h" + +#if !defined (ACE_LACKS_PRAGMA_ONCE) +#pragma once +#endif /* ACE_LACKS_PRAGMA_ONCE */ + +class URL_Visitor_Factory +{ + // = TITLE + // Abstract base class that creates URL visitors. + // + // = DESCRIPTION + // Subclasses define each of the Factory Methods to + // make the right objects, which all "vary" together. +public: + + /// Destructor. + virtual ~URL_Visitor_Factory (void); + + virtual URL_Visitor *make_visitor (void) = 0; + // Factory Method that makes the appropriate type of <URL_Visitor>. + + virtual Command_Processor *make_command_processor (void) = 0; + // Factory Method that makes the appropriate type of + // <Command_Processor>. +}; + +class URL_Validation_Visitor_Factory : public URL_Visitor_Factory +{ + // = TITLE + // Create a URL visitor that validates URL links. +public: + virtual URL_Visitor *make_visitor (void); + // Factory Method that makes a <URL_Validation_Visitor>. + + virtual Command_Processor *make_command_processor (void); + // Factory Method that makes a <FIFO_Command_Processor>. + + +}; + +class URL_Download_Visitor_Factory : public URL_Visitor_Factory +{ + // = TITLE + // Create a URL visitor that downloads URL links. +public: + virtual URL_Visitor *make_visitor (void); + // Factory Method that makes a <URL_Download_Visitor>. + + virtual Command_Processor *make_command_processor (void); + // Factory Method that makes a <FIFO_Command_Processor>. 
+}; + +#endif /* _URL_VISITOR_FACTORY_H */ diff --git a/ACE/examples/Web_Crawler/Web_Crawler.cpp b/ACE/examples/Web_Crawler/Web_Crawler.cpp new file mode 100644 index 00000000000..16639a38d73 --- /dev/null +++ b/ACE/examples/Web_Crawler/Web_Crawler.cpp @@ -0,0 +1,95 @@ +// $Id$ + +#include "Options.h" +#include "URL_Visitor_Factory.h" +#include "Web_Crawler.h" + +ACE_RCSID(Web_Crawler, Web_Crawler, "$Id$") + +Web_Crawler::~Web_Crawler (void) +{ + delete this->url_visitor_factory_; +} + +Web_Crawler::Web_Crawler (void) + : url_visitor_factory_ (0) +{ +} + +int +Web_Crawler::open (int argc, ACE_TCHAR *argv[]) +{ + if (OPTIONS::instance ()->parse_args (argc, argv) == -1) + return -1; + // @@ Put the ACE_Service_Config::open() stuff here somewhere... + else + { + // For now just hardcode this to create "validation" visitors. + ACE_NEW_RETURN (this->url_visitor_factory_, + URL_Validation_Visitor_Factory, + -1); + return 0; + } +} + +int +Web_Crawler::run (void) +{ + // Make the appropriate <URL_Visitor>. + Auto_Destroyer<URL_Visitor> visitor (this->url_visitor_factory_->make_visitor ()); + + if (*visitor == 0) + ACE_ERROR_RETURN ((LM_ERROR, + "%p\n", + "make_visitor"), + -1); + + // Make the appropriate <Command_Processor>. + Auto_Destroyer<Command_Processor> cp (this->url_visitor_factory_->make_command_processor ()); + + if (*cp == 0) + ACE_ERROR_RETURN ((LM_ERROR, + "%p\n", + "make_command_processor"), + -1); + + // Set the <Command_Processor> in the <Options> to make it visible. + OPTIONS::instance ()->command_processor (*cp); + + // Set the <URL_Visitor> in the <Options> to make it visible. + OPTIONS::instance ()->visitor (*visitor); + + // @@ You fill in here... 
+ ACE_URL_Addr *url_addr; + ACE_NEW_RETURN (url_addr, + ACE_URL_Addr (OPTIONS::instance()->hostname (), + OPTIONS::instance()->path_name (), + OPTIONS::instance()->port_no ()), //KIRTHIKA + 0); + Auto_Destroyer<ACE_URL_Addr> url_addr_ptr (url_addr); + + HTTP_URL *http_url; + ACE_NEW_RETURN (http_url, + HTTP_URL (**url_addr_ptr), + 0); + + Auto_Destroyer<HTTP_URL> http_url_ptr (http_url); + + URL_Command *url_command; + ACE_NEW_RETURN (url_command, + URL_Command (*http_url_ptr), + 0); + // Auto_Destroyer<URL_Command> url_command_ptr (url_command); + + if (cp->insert (url_command) != 0) + ACE_ERROR_RETURN ((LM_ERROR, + "%p\n", "insert"), + -1); + + if (cp->execute () != 0) + ACE_ERROR_RETURN ((LM_ERROR, + "%p\n", "execute"), + -1); + return 0; +} + diff --git a/ACE/examples/Web_Crawler/Web_Crawler.h b/ACE/examples/Web_Crawler/Web_Crawler.h new file mode 100644 index 00000000000..01e275e2187 --- /dev/null +++ b/ACE/examples/Web_Crawler/Web_Crawler.h @@ -0,0 +1,62 @@ +/* -*- C++ -*- */ +// $Id$ + +// ============================================================================ +// +// = LIBRARY +// examples/Web_Crawler +// +// = FILENAME +// Web_Crawler.h +// +// = AUTHOR +// Douglas C. Schmidt <schmidt@cs.wustl.edu> +// +// ============================================================================ + +#ifndef _WEB_CRAWLER_H +#define _WEB_CRAWLER_H + +#include "URL_Addr.h" +#include "HTTP_URL.h" + +#if !defined (ACE_LACKS_PRAGMA_ONCE) +#pragma once +#endif /* ACE_LACKS_PRAGMA_ONCE */ + +// Forward declaration. +class URL_Visitor_Factory; + +class Web_Crawler +{ + // = TITLE + // An abstraction for a Web Crawler. + // + // = DESCRIPTION + // This class is a Facade that organizes the other classes in the + // solution, which include a factory that creates a visitor, + // which in turn embodies the appropriate visitation strategy. +public: + // = Initialization and termination methods. + Web_Crawler (void); + // Constructor. + + ~Web_Crawler (void); + // Destructor. 
+ + int open (int argc, ACE_TCHAR *argv[]); + // Parses the command-line options and initializes the + // <URL_Visitor_Factory>. + + int run (void); + // Run the Web Crawler and carries out whatever visitation strategy + // is configured. Returns -1 on failure and 0 on success. + +private: + URL_Visitor_Factory *url_visitor_factory_; + // Pointer to a factory that creates visitors that explore URLs and + // perform various tasks. Subclasses of <URL_Visitor_Factory> + // determine what happens during a visitation. +}; + +#endif /* _WEB_CRAWLER_H */ diff --git a/ACE/examples/Web_Crawler/Web_Crawler.mpc b/ACE/examples/Web_Crawler/Web_Crawler.mpc new file mode 100644 index 00000000000..7750d7cbd5d --- /dev/null +++ b/ACE/examples/Web_Crawler/Web_Crawler.mpc @@ -0,0 +1,7 @@ +// -*- MPC -*- +// $Id$ + +project : aceexe { + avoids += ace_for_tao + exename = main +} diff --git a/ACE/examples/Web_Crawler/main.cpp b/ACE/examples/Web_Crawler/main.cpp new file mode 100644 index 00000000000..1735f811b78 --- /dev/null +++ b/ACE/examples/Web_Crawler/main.cpp @@ -0,0 +1,51 @@ +// $Id$ + +// ============================================================================ +// +// = LIBRARY +// examples/Web_Crawler +// +// = FILENAME +// main.cpp +// +// = DESCRIPTION +// This program implements a Web crawler that can be configured to +// apply various strategies to URLs that it visits. 
+// +// = AUTHOR +// Doug Schmidt <schmidt@cs.wustl.edu> +// +// ============================================================================ + +#include "ace/OS_main.h" +#include "ace/Signal.h" +#include "Web_Crawler.h" +#include "Options.h" + +ACE_RCSID(Web_Crawler, main, "$Id$") + +void sig_handler (int) +{ + ACE_DEBUG ((LM_DEBUG, + ACE_TEXT ("aborting!\n"))); + ACE_OS::abort (); +} + +int +ACE_TMAIN (int argc, ACE_TCHAR *argv[]) +{ +#if !defined (ACE_HAS_WINCE) + ACE_Sig_Action sa ((ACE_SignalHandler) sig_handler, SIGFPE); +#endif + Web_Crawler crawler; + + if (crawler.open (argc, argv) == -1) + return 1; + else if (crawler.run () == -1) + return 1; + else + return 0; +} + + + |