summaryrefslogtreecommitdiff
path: root/ACE/examples/Web_Crawler/Iterators.h
blob: dc1dcc6a2376f417b988f8d993e54aeb23a9a9c2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
/* -*- C++ -*- */

//=============================================================================
/**
 *  @file    Iterators.h
 *
 *  @author Douglas C. Schmidt <d.schmidt@vanderbilt.edu>
 */
//=============================================================================


#ifndef _ITERATORS_H
#define _ITERATORS_H

#include "URL.h"

#if !defined (ACE_LACKS_PRAGMA_ONCE)
#pragma once
#endif /* ACE_LACKS_PRAGMA_ONCE */

/**
 * @class URL_Iterator
 *
 * @brief Abstract interface for iterating over strings found in a URL.
 *
 * Concrete subclasses decide which strings <next> passes back, so
 * higher-level software does not need to know what kind of URL
 * header or body is being traversed.
 */
class URL_Iterator
{
public:
  // = Initialization and termination methods.

  /// Public "virtual" destructor substitute; the real C++ destructor
  /// is protected.
  virtual int destroy ();

  // = Iterator methods.

  /// Store the next item that hasn't been seen yet in <string>.
  /// Returns 0 once all items have been seen, else 1.
  virtual int next (ACE_CString &string) = 0;

protected:
  /// C++ destructor; protected, so it is only reachable from within
  /// the class hierarchy.
  virtual ~URL_Iterator ();
};

/**
 * @class HTML_Body_Iterator
 *
 * @brief An iterator that returns URLs embedded in HTML files.
 */
class HTML_Body_Iterator : public URL_Iterator
{
public:
  // = Initialization and termination methods.
  /// Constructor.  Declared explicit so that a <URL> is never
  /// silently converted into an iterator.  The iterator keeps a
  /// reference to a URL (see <url_>), so the referenced URL is
  /// expected to outlive the iterator.
  explicit HTML_Body_Iterator (URL &url);

  // = Iterator methods.
  /**
   * Pass back the next <url> that hasn't been seen in the
   * memory-mapped file.  Returns 0 when all items have been seen,
   * else 1.
   */
  virtual int next (ACE_CString &url);

private:
  /// HTTP URL that we're iterating over (held by reference).
  URL &url_;
};

/**
 * @class HTTP_Header_Iterator
 *
 * @brief An iterator that iterates over the HTTP header.
 */
class HTTP_Header_Iterator : public URL_Iterator
{
public:
  // = Initialization and termination methods.
  /// Constructor.  Declared explicit so that a <URL> is never
  /// silently converted into an iterator.  The iterator keeps a
  /// reference to a URL (see <url_>), so the referenced URL is
  /// expected to outlive the iterator.
  explicit HTTP_Header_Iterator (URL &url);

  // = Iterator methods.
  /**
   * Pass back the next <line> that hasn't been seen in the
   * memory-mapped file header.  Returns 0 when we've reached the end
   * of the header, else 1.
   */
  virtual int next (ACE_CString &line);

private:
  /// HTTP URL that we're iterating over (held by reference).
  URL &url_;

  /// Flag recording that we've found the end of the header, which
  /// means this iterator is finished.
  int end_of_header_;
};

/**
 * @class URL_Download_Iterator
 *
 * @brief An iterator that iterates over the contents of an entire URL,
 * i.e., both header and body, and returns it in <BUFSIZ>
 * <buffer>s.
 */
class URL_Download_Iterator : public URL_Iterator
{
public:
  // = Initialization and termination methods.
  /// Constructor.  Declared explicit so that a <URL> is never
  /// silently converted into an iterator.  The iterator keeps a
  /// reference to a URL (see <url_>), so the referenced URL is
  /// expected to outlive the iterator.
  explicit URL_Download_Iterator (URL &url);

  // = Iterator methods.
  /**
   * Pass back the next <buffer> data from the stream, where
   * <buffer.size> <= <BUFSIZ> .  Returns 0 when we've reached the end
   * of the header, else 1.
   *
   * NOTE(review): since this iterator covers both header and body,
   * "end of the header" presumably means the end of the stream --
   * confirm against the implementation.
   */
  virtual int next (ACE_CString &buffer);

private:
  /// HTTP URL that we're iterating over (held by reference).
  URL &url_;
};

#endif /* _ITERATORS_H */