summaryrefslogtreecommitdiff
path: root/ext/tidy/examples
diff options
context:
space:
mode:
Diffstat (limited to 'ext/tidy/examples')
-rw-r--r--ext/tidy/examples/cleanhtml.php40
-rw-r--r--ext/tidy/examples/dumpit.php94
-rw-r--r--ext/tidy/examples/urlgrab.php63
3 files changed, 197 insertions, 0 deletions
diff --git a/ext/tidy/examples/cleanhtml.php b/ext/tidy/examples/cleanhtml.php
new file mode 100644
index 0000000000..c949a0cfc2
--- /dev/null
+++ b/ext/tidy/examples/cleanhtml.php
@@ -0,0 +1,40 @@
+<?php
+
+ /*
+ * cleanhtml.php
+ *
+ * A simple script to clean and repair HTML, XHTML, PHP, ASP, etc. documents.
+ * If no file is provided, it reads from standard input.
+ *
+ * By: John Coggeshall <john@php.net>
+ *
+ * Usage: php cleanhtml.php [filename]
+ *
+ */
+
+ /* Create the Tidy resource used by every call below */
+ $tidy = tidy_create();
+
+ /* With no filename argument the document is read from standard input */
+ if(!isset($_SERVER['argv'][1])) {
+     $data = file_get_contents("php://stdin");
+     if($data === false) {
+         /* Nothing could be read, so there is nothing to hand to Tidy */
+         echo "Unable to read from standard input\n";
+         exit(1);
+     }
+     tidy_parse_string($tidy, $data);
+ } else {
+     tidy_parse_file($tidy, $_SERVER['argv'][1]);
+ }
+
+ /* Clean and repair the parsed document */
+ tidy_clean_repair($tidy);
+
+ /* Show Tidy's error buffer first when it counted any warnings or errors */
+ if(tidy_warning_count($tidy) ||
+     tidy_error_count($tidy)) {
+
+     echo "\n\nThe following errors or warnings occurred:\n";
+     echo tidy_get_error_buffer($tidy);
+     echo "\n";
+ }
+
+ /* Print Tidy's output for the document */
+ echo tidy_get_output($tidy);
+
+?>
+
+
+
+ \ No newline at end of file
diff --git a/ext/tidy/examples/dumpit.php b/ext/tidy/examples/dumpit.php
new file mode 100644
index 0000000000..46d307d704
--- /dev/null
+++ b/ext/tidy/examples/dumpit.php
@@ -0,0 +1,94 @@
+<?php
+ /*
+ * dumpit.php
+ *
+ * a command-line script which dumps the given HTML, PHP, ASP, XHTML, etc.
+ * file as it is represented in the document model.
+ *
+ * By: John Coggeshall <john@php.net>
+ *
+ * Usage: php dumpit.php <filename>
+ */
+
+
+ /* The filename argument is mandatory; stop with a usage hint if it is missing */
+ if(!isset($_SERVER['argv'][1])) {
+     echo "Usage: php dumpit.php <filename>\n";
+     exit(1);
+ }
+
+ $tidy = tidy_create();
+ tidy_parse_file($tidy, $_SERVER['argv'][1]);
+
+ /* Optionally you can do this here if you want to fix up the document */
+
+ /* tidy_clean_repair($tidy); */
+
+ /* Start at the root node of the parsed document and print the whole tree */
+ $tree = tidy_get_root($tidy);
+ dump_tree($tree);
+ echo "\n";
+
+ /*
+  * Translate a TIDY_NODETYPE_* value into a human-readable label.
+  * A value that matches none of the known node types yields "Unknown Node".
+  */
+ function node_type($type) {
+
+     /* Known node types, in the order they are tested, paired with their labels */
+     $labels = array(
+         array(TIDY_NODETYPE_ROOT,     "Root Node"),
+         array(TIDY_NODETYPE_DOCTYPE,  "DocType Node"),
+         array(TIDY_NODETYPE_COMMENT,  "Comment Node"),
+         array(TIDY_NODETYPE_PROCINS,  "ProcIns Node"),
+         array(TIDY_NODETYPE_TEXT,     "Text Node"),
+         array(TIDY_NODETYPE_START,    "Start Node"),
+         array(TIDY_NODETYPE_END,      "End Node"),
+         array(TIDY_NODETYPE_STARTEND, "Start/End Node"),
+         array(TIDY_NODETYPE_CDATA,    "CDATA Node"),
+         array(TIDY_NODETYPE_SECTION,  "Section Node"),
+         array(TIDY_NODETYPE_ASP,      "ASP Source Code Node"),
+         array(TIDY_NODETYPE_PHP,      "PHP Source Code Node"),
+         array(TIDY_NODETYPE_JSTE,     "JSTE Source Code"),
+         array(TIDY_NODETYPE_XMLDECL,  "XML Declaration Node"),
+     );
+
+     foreach($labels as $entry) {
+         /* Loose comparison, first match wins */
+         if($type == $entry[0]) {
+             return $entry[1];
+         }
+     }
+
+     return "Unknown Node";
+ }
+
+ /*
+  * Print $string preceded by $indent spaces.
+  */
+ function do_leaf($string, $indent) {
+     /* Build the left margin one space at a time, then emit margin and text */
+     $margin = "";
+     $col = 0;
+     while($col < $indent) {
+         $margin .= " ";
+         $col++;
+     }
+     echo $margin, $string;
+ }
+
+ /*
+  * Recursively print $node and everything beneath it as an ASCII tree.
+  *
+  * $node   - the node to print; a value that evaluates to false is ignored
+  * $indent - number of spaces printed in front of every line for this node;
+  *           each level of children is printed 3 columns further in
+  */
+ function dump_tree($node, $indent = 0) {
+     if(!$node) {
+         return;
+     }
+
+     /* Put something there if the node name is empty. The comparison is
+        against "" rather than empty(), which would also replace a name of "0" */
+     $nodename = trim(strtoupper((string) $node->name));
+     if($nodename === "") {
+         $nodename = "[EMPTY]";
+     }
+
+     /* Generate the Node, and a pretty name for it */
+     do_leaf(" + $nodename (".node_type($node->type).")\n", $indent);
+
+     /* Check to see if this node is a text node. Text nodes are
+        generated by start/end tags and contain the text in between.
+        i.e. <B>foo</B> will create a text node with $node->value
+        equal to 'foo' */
+     if($node->type == TIDY_NODETYPE_TEXT) {
+         do_leaf(" |\n", $indent);
+         do_leaf(" +---- Value: '{$node->value}'\n", $indent);
+     }
+
+     /* Any attributes on this node? !empty() is used instead of count()
+        because count() does not accept a null attribute list on current PHP */
+     if(!empty($node->attribs)) {
+         do_leaf(" |\n", $indent);
+         do_leaf(" +---- Attributes\n", $indent);
+
+         /* Cycle through the attributes and display them and their values. */
+         foreach($node->attribs as $attrib) {
+             do_leaf(" +--{$attrib->name}\n", $indent);
+             do_leaf(" | +-- Value: {$attrib->value}\n", $indent);
+         }
+     }
+
+     /* Recurse along the children to generate the remaining nodes */
+     if($node->has_children()) {
+         foreach($node->children as $child) {
+             dump_tree($child, $indent + 3);
+         }
+     }
+ }
+
+
+?> \ No newline at end of file
diff --git a/ext/tidy/examples/urlgrab.php b/ext/tidy/examples/urlgrab.php
new file mode 100644
index 0000000000..63a2875a79
--- /dev/null
+++ b/ext/tidy/examples/urlgrab.php
@@ -0,0 +1,63 @@
+<?php
+
+ /*
+ * urlgrab.php
+ *
+ * A simple command-line utility to extract all of the URLs contained
+ * within <A HREF> tags from a document.
+ *
+ * By: John Coggeshall <john@php.net>
+ *
+ * Usage: php urlgrab.php <file>
+ *
+ */
+
+ /* The file argument is mandatory; stop with a usage hint if it is missing */
+ if(!isset($_SERVER['argv'][1])) {
+     echo "Usage: php urlgrab.php <file>\n";
+     exit(1);
+ }
+
+ /* Create a Tidy Resource */
+ $tidy = tidy_create();
+
+ /* Parse the document */
+ tidy_parse_file($tidy, $_SERVER['argv'][1]);
+
+ /* Fix up the document */
+ tidy_clean_repair($tidy);
+
+ /* Get an object representing everything from the <HTML> tag in */
+ $html = tidy_get_html($tidy);
+
+ /* Traverse the document tree and print every link that was found */
+ print_r(get_links($html));
+
+ /*
+  * Collect the HREF values of every <A> tag found at or below $node.
+  *
+  * Returns a flat array: the node's own link first (if any), followed by
+  * the links of each child subtree in the order the children are listed.
+  */
+ function get_links($node) {
+     $found = array();
+
+     /* An <A> tag contributes its own HREF attribute, when it has one */
+     if($node->id == TIDY_TAG_A) {
+         $href = $node->get_attr_type(TIDY_ATTR_HREF);
+         if($href) {
+             $found[] = $href->value;
+         }
+     }
+
+     /* Without children there is nothing further to collect */
+     if(!$node->has_children()) {
+         return $found;
+     }
+
+     /* Gather the links of each child subtree and append them in order */
+     foreach($node->children as $child) {
+         $found = array_merge($found, get_links($child));
+     }
+
+     return $found;
+ }
+
+?> \ No newline at end of file