summaryrefslogtreecommitdiff
path: root/ACE/examples/Web_Crawler/Iterators.h
blob: 27b7ae460527665d6dce4c0bf6ae47d43df3f876 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
/* -*- C++ -*- */

//=============================================================================
/**
 *  @file    Iterators.h
 *
 *  $Id$
 *
 *  @author Douglas C. Schmidt <schmidt@cs.wustl.edu>
 */
//=============================================================================


#ifndef _ITERATORS_H
#define _ITERATORS_H

#include "URL.h"

#if !defined (ACE_LACKS_PRAGMA_ONCE)
#pragma once
#endif /* ACE_LACKS_PRAGMA_ONCE */

/**
 * @class URL_Iterator
 *
 * @brief Abstract interface for iterators that hand back strings one
 * at a time.
 *
 * Each concrete subclass decides which strings <next> produces, so
 * higher-level code can walk a URL header or body without depending
 * on the details of what is being iterated over.
 */
class URL_Iterator
{
public:
  // = Initialization and termination methods.

  /// Acts as the "virtual" destructor.  Because the C++ destructor
  /// below is protected, code outside the class hierarchy cannot
  /// <delete> a URL_Iterator directly.
  /// NOTE(review): the meaning of the return value is defined by the
  /// implementation, which is not part of this header.
  virtual int destroy ();

  // = Iterator methods.

  /// Place the next not-yet-seen item in <string>.  Returns 1 while
  /// items remain and 0 once every item has been seen.
  virtual int next (ACE_CString &string) = 0;

protected:
  /// C++ destructor; protected, so it is only reachable from within
  /// the class hierarchy.
  virtual ~URL_Iterator ();
};

/**
 * @class HTML_Body_Iterator
 *
 * @brief An iterator that returns URLs embedded in HTML files.
 */
class HTML_Body_Iterator : public URL_Iterator
{
public:
  // = Initialization and termination methods.
  /// Constructor.  Declared explicit so that a URL is never silently
  /// converted into an iterator.  The class keeps a reference member
  /// (<url_>), so presumably <url> must outlive this iterator --
  /// confirm against the constructor's definition.
  explicit HTML_Body_Iterator (URL &url);

  // = Iterator methods.
  /**
   * Pass back the next <url> that hasn't been seen in the
   * memory-mapped file.  Returns 0 when all items have been seen,
   * else 1.
   */
  virtual int next (ACE_CString &url);

private:
  /// HTTP URL that we're iterating over.
  URL &url_;
};

/**
 * @class HTTP_Header_Iterator
 *
 * @brief An iterator that iterates over the HTTP header.
 */
class HTTP_Header_Iterator : public URL_Iterator
{
public:
  // = Initialization and termination methods.
  /// Constructor.  Declared explicit so that a URL is never silently
  /// converted into an iterator.  The class keeps a reference member
  /// (<url_>), so presumably <url> must outlive this iterator --
  /// confirm against the constructor's definition.
  explicit HTTP_Header_Iterator (URL &url);

  // = Iterator methods.
  /**
   * Pass back the next <line> that hasn't been seen in the
   * memory-mapped file header.  Returns 0 when we've reached the end
   * of the header, else 1.
   */
  virtual int next (ACE_CString &line);

private:
  /// HTTP URL that we're iterating over.
  URL &url_;

  /// Flag recording that we've found the end of the header, which
  /// means this iterator is finished.
  int end_of_header_;
};

/**
 * @class URL_Download_Iterator
 *
 * @brief An iterator that iterates over the contents of an entire URL,
 * i.e., both header and body, and returns it in <BUFSIZ>
 * <buffer>s.
 */
class URL_Download_Iterator : public URL_Iterator
{
public:
  // = Initialization and termination methods.
  /// Constructor.  Declared explicit so that a URL is never silently
  /// converted into an iterator.  The class keeps a reference member
  /// (<url_>), so presumably <url> must outlive this iterator --
  /// confirm against the constructor's definition.
  explicit URL_Download_Iterator (URL &url);

  // = Iterator methods.
  /**
   * Pass back the next <buffer> of data from the stream, where
   * <buffer.size> <= <BUFSIZ>.  Returns 0 when there is no more data
   * to pass back, else 1.
   * NOTE(review): this comment previously said "end of the header",
   * which looks like a copy-paste slip since this iterator covers
   * both header and body -- confirm against the implementation.
   */
  virtual int next (ACE_CString &buffer);

private:
  /// HTTP URL that we're iterating over.
  URL &url_;
};

#endif /* _ITERATORS_H */