libs/flyweight/example/html.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321

/* Boost.Flyweight example of flyweight-based formatted text processing.
 *
 * Copyright 2006-2014 Joaquin M Lopez Munoz.
 * Distributed under the Boost Software License, Version 1.0.
 * (See accompanying file LICENSE_1_0.txt or copy at
 * http://www.boost.org/LICENSE_1_0.txt)
 *
 * See http://www.boost.org/libs/flyweight for library home page.
 */

#include <boost/flyweight.hpp>
#include <boost/functional/hash.hpp>
#include <algorithm>
#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <sstream>
#include <string>
#include <vector>

#if defined(BOOST_NO_STDC_NAMESPACE)
namespace std{using ::exit;using ::tolower;}
#endif

using namespace boost::flyweights;

/* An HTML tag consists of a name and optional properties of the form
 * name1=value1 ... namen=valuen. We do not need to parse the properties
 * for the purposes of the program, hence they are all stored in
 * html_tag_data::properties in raw form.
 */

struct html_tag_data
{
  std::string name;       /* tag name */
  std::string properties; /* everything after the name, kept unparsed */
};

/* Two tags compare equal when both their names and their raw property
 * strings are equal.
 */

bool operator==(const html_tag_data& x,const html_tag_data& y)
{
  if(x.name!=y.name)return false;
  return x.properties==y.properties;
}

/* See the portability section of Boost.Hash at
 *   http://boost.org/doc/html/hash/portability.html
 * for an explanation of the ADL-related workarounds.
 */

#if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP)
namespace boost{
#endif

/* Hash of a tag: combines the same two members that operator== compares,
 * name first and then properties.
 */
std::size_t hash_value(const html_tag_data& x)
{
  std::size_t seed=0;
  boost::hash_combine(seed,x.name);
  boost::hash_combine(seed,x.properties);
  return seed;
}

#if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP)
} /* namespace boost */
#endif

/* html_tag: an html_tag_data value wrapped in a Boost.Flyweight flyweight. */
typedef flyweight<html_tag_data> html_tag;

/* parse_tag is passed an iterator positioned at the first char of
 * the tag after the opening '<' and returns, if successful, a parsed tag
 * and whether it is opening (<xx>) or closing (</xx>).
 */

/* Outcome reported by parse_tag. */
enum tag_type
{
  opening, /* <xx>  */
  closing, /* </xx> */
  failure  /* parse_tag could not extract a tag */
};

/* Result of parse_tag: the kind of tag found (or failure) plus the tag
 * itself. When only a type is given the tag is built from a
 * default-constructed html_tag_data.
 */
struct parse_tag_res
{
  tag_type type;
  html_tag tag;

  parse_tag_res(tag_type type_,const html_tag_data& tag_=html_tag_data())
    :type(type_),
     tag(tag_)
  {}

  parse_tag_res(const parse_tag_res& x)
    :type(x.type),
     tag(x.tag)
  {}
};

/* Parses the tag whose text begins at first (just past the opening '<').
 *
 * Characters are accumulated up to the first '>' that is not inside a
 * double-quoted section. On success first is advanced past that '>' and
 * the result carries the tag (name converted to lower case, properties
 * kept in raw form) along with its type: closing if the text starts with
 * '/', opening otherwise. If the accumulated text is empty or all
 * whitespace, or if last is reached without finding a terminating '>',
 * the result has type failure and first is left unmodified.
 */

template<typename ForwardIterator>
parse_tag_res parse_tag(ForwardIterator& first,ForwardIterator last)
{
  html_tag_data  tag;
  std::string    buf;
  bool           in_quote=false;
  for(ForwardIterator it=first;it!=last;){
    char ch=*it++;
    if(ch=='>'&&!in_quote){             /* ignore '>'s if inside quotes */
      tag_type type;
      std::string::size_type
        bname=buf.find_first_not_of("\t\n\r "),
        ename=bname==std::string::npos?
          std::string::npos:
          buf.find_first_of("\t\n\r ",bname),
        bprop=ename==std::string::npos?
          std::string::npos:
          buf.find_first_not_of("\t\n\r ",ename);
      if(bname==ename){                 /* null name */
        return parse_tag_res(failure);
      }
      else if(buf[bname]=='/'){         /* closing tag */
        type=closing;
        ++bname;
      }
      else type=opening;
      tag.name=buf.substr(bname,ename-bname);

      /* Normalize tag name to lower case. std::tolower requires its
       * argument to be representable as unsigned char (or EOF), so each
       * char is converted first: passing a negative plain char would be
       * undefined behavior.
       */

      for(std::string::iterator c=tag.name.begin();c!=tag.name.end();++c){
        *c=static_cast<char>(std::tolower(static_cast<unsigned char>(*c)));
      }
      if(bprop!=std::string::npos){
        tag.properties=buf.substr(bprop,buf.size());
      }
      first=it;                         /* result good, consume the chars */
      return parse_tag_res(type,tag);
    }
    else{
      if(ch=='"')in_quote=!in_quote;
      buf+=ch;
    }
  }
  return parse_tag_res(failure);        /* end reached and found no '>' */
}

/* A character context is just a vector containing the tags enclosing the
 * character, from the outermost level to the innermost.
 */

/* html_context_data: sequence of enclosing tags, outermost first.
 * html_context: that sequence wrapped in a Boost.Flyweight flyweight.
 */
typedef std::vector<html_tag>        html_context_data;
typedef flyweight<html_context_data> html_context;

/* A character is a char code plus its context.
 */

/* A char code together with the context (enclosing tags) it appears in.
 * By default the code is 0 and the context is a default-constructed
 * html_context.
 */
struct character_data
{
  char         code;
  html_context context;

  character_data(char code_=0,html_context context_=html_context())
    :code(code_),
     context(context_)
  {}

  character_data(const character_data& x)
    :code(x.code),
     context(x.context)
  {}
};

/* Two characters are equal when they have the same code and the same
 * context.
 */
bool operator==(const character_data& x,const character_data& y)
{
  if(x.code!=y.code)return false;
  return x.context==y.context;
}

#if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP)
namespace boost{
#endif

/* Hash of a character: combines the same two members that operator==
 * compares, code first and then context.
 */
std::size_t hash_value(const character_data& x)
{
  std::size_t seed=0;
  boost::hash_combine(seed,x.code);
  boost::hash_combine(seed,x.context);
  return seed;
}

#if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP)
} /* namespace boost */
#endif

/* character: a character_data value wrapped in a Boost.Flyweight flyweight. */
typedef flyweight<character_data> character;

/* scan_html converts HTML code into a stream of contextualized characters.
 */

/* Scans [first,last) and writes to out one character per non-tag char,
 * each paired with the context (stack of enclosing tags) in effect at that
 * point. Tags themselves produce no output: an opening tag is pushed onto
 * the context; a closing tag pops the context back to, and including, the
 * innermost tag with the same name, and is ignored if no such tag is open.
 * If the text after a '<' cannot be parsed as a tag, the '<' is dropped
 * and scanning continues with the following chars as ordinary text.
 */

template<typename ForwardIterator,typename OutputIterator>
void scan_html(ForwardIterator first,ForwardIterator last,OutputIterator out)
{
  html_context_data context;
  while(first!=last){
    if(*first=='<'){                                 /* tag found */
      ++first;
      parse_tag_res res=parse_tag(first,last);
      if(res.type==opening){                         /* add to context */
        context.push_back(res.tag);
        continue;
      }
      else if(res.type==closing){                    /* remove from context */
        /* Pop all tags from the innermost to the matching one; this takes
         * care of missing </xx>s like e.g. in <ul><li>hello</ul>.
         */

        for(html_context_data::reverse_iterator rit=context.rbegin();
            rit!=context.rend();++rit){
          if(rit->get().name==res.tag.get().name){
            context.erase(rit.base()-1,context.end());
            break;
          }
        }
        continue;
      }

      /* Parsing failed and parse_tag left first just past the '<'. If that
       * '<' was the last char of the input there is nothing left to emit,
       * and dereferencing first below would read past the end.
       */

      if(first==last)break;
    }
    *out++=character(*first++,html_context(context));
  }
}

/* HTML-producing utilities */

/* Writes <name> to os, or <name properties> when the tag has a non-empty
 * properties string.
 */
void print_opening_tag(std::ostream& os,const html_tag_data& x)
{
  const bool has_properties=x.properties.size()!=0;
  os<<"<";
  os<<x.name;
  if(has_properties){
    os<<" ";
    os<<x.properties;
  }
  os<<">";
}

/* Writes </name> to os, except for tags whose name begins with '!'
 * (SGML declarations), which are not closed and produce no output.
 */
void print_closing_tag(std::ostream& os,const html_tag_data& x)
{
  if(x.name[0]=='!')return;
  os<<"</";
  os<<x.name;
  os<<">";
}

/* change_context takes contexts from and to with tags
 *
 *   from<- c1 ... cn fn+1 ... fm
 *   to  <- c1 ... cn tn+1 ... tk
 *
 * (that is, they share the first n tags, n might be 0), and
 * produces code closing fm ... fn+1 and opening tn+1 ... tk.
 */

/* Writes to out the closing tags for the part of from not shared with to
 * (innermost first) followed by the opening tags for the part of to not
 * shared with from, and returns the output iterator positioned past the
 * last char written. Callers that keep writing afterwards must continue
 * from the returned iterator: an output iterator is single-pass, so the
 * copy they passed in is not guaranteed to reflect what was written here
 * (it does not, for instance, for a raw pointer).
 */

template<typename OutputIterator>
OutputIterator change_context(
  const html_context_data& from,const html_context_data& to,
  OutputIterator out)
{
  std::ostringstream oss;
  html_context_data::const_iterator
    it0=from.begin(),
    it0_end=from.end(),
    it1=to.begin(),
    it1_end=to.end();

  /* skip the leading tags common to both contexts */
  for(;it0!=it0_end&&it1!=it1_end&&*it0==*it1;++it0,++it1);
  while(it0_end!=it0)print_closing_tag(oss,*--it0_end);
  while(it1!=it1_end)print_opening_tag(oss,*it1++);
  std::string str=oss.str();
  return std::copy(str.begin(),str.end(),out);
}

/* produce_html is passed a bunch of contextualized characters and emits
 * the corresponding HTML. The algorithm is simple: tags are opened and closed
 * as a result of the context from one character to the following changing.
 */

template<typename ForwardIterator,typename OutputIterator>
void produce_html(ForwardIterator first,ForwardIterator last,OutputIterator out)
{
  html_context context;
  while(first!=last){
    if(first->get().context!=context){
      /* keep writing from where change_context stopped */
      out=change_context(context,first->get().context,out);
      context=first->get().context;
    }
    *out++=(first++)->get().code;
  }
  change_context(context,html_context(),out); /* close remaining context */
}

/* Without these explicit instantiations, MSVC++ 6.5/7.0 does not
 * find some friend operators in certain contexts.
 */      

/* These objects are not referenced anywhere else in this file; defining
 * them requires character and html_tag to be instantiated.
 */
character dummy1;
html_tag  dummy2;

/* Asks on standard input for the name of an HTML file, scans it into a
 * vector of contextualized characters, reverses the central half of that
 * vector, then asks for an output file name and writes the resulting HTML
 * there. Returns EXIT_FAILURE if either file cannot be opened, 0 otherwise.
 */
int main()
{
  std::cout<<"input html file: ";
  std::string in;
  std::getline(std::cin,in);
  std::ifstream ifs(in.c_str());
  if(!ifs){
    std::cout<<"can't open "<<in<<std::endl;

    /* return rather than std::exit so that automatic objects are
     * destroyed on the way out
     */

    return EXIT_FAILURE;
  }
  typedef std::istreambuf_iterator<char> istrbuf_iterator;
  std::vector<char> html_source;
  std::copy(
    istrbuf_iterator(ifs),istrbuf_iterator(),
    std::back_inserter(html_source));

  /* parse the HTML */

  std::vector<character> scanned_html;
  scan_html(
    html_source.begin(),html_source.end(),std::back_inserter(scanned_html));

  /* Now that we have the text as a vector of contextualized characters,
   * we can shuffle it around and manipulate in almost any way we please.
   * For instance, the following reverses the central portion of the doc.
   */

  std::reverse(
    scanned_html.begin()+scanned_html.size()/4,
    scanned_html.begin()+3*(scanned_html.size()/4));

  /* emit the resulting HTML */

  std::cout<<"output html file: ";
  std::string out;
  std::getline(std::cin,out);
  std::ofstream ofs(out.c_str());
  if(!ofs){
    std::cout<<"can't open "<<out<<std::endl;
    return EXIT_FAILURE;                /* see above: unwind, don't exit */
  }
  typedef std::ostreambuf_iterator<char> ostrbuf_iterator;
  produce_html(scanned_html.begin(),scanned_html.end(),ostrbuf_iterator(ofs));

  return 0;
}