diff options
Diffstat (limited to 'ext/tidy/README_TIDY')
| -rw-r--r-- | ext/tidy/README_TIDY | 154 |
1 files changed, 154 insertions, 0 deletions
diff --git a/ext/tidy/README_TIDY b/ext/tidy/README_TIDY new file mode 100644 index 0000000000..9b15dcd102 --- /dev/null +++ b/ext/tidy/README_TIDY @@ -0,0 +1,154 @@ + +README FOR ext/tidy by John Coggeshall <john@php.net> + +Tidy Version: 0.5b + +Tidy is an extension based on Libtidy (http://tidy.sf.net/) and allows a PHP developer +to clean, repair, and traverse HTML, XHTML, and XML documents -- including ones with +embedded scripting languages such as PHP or ASP within them using OO constructs. + +The Tidy extension has two separate APIs, one for general parsing, cleaning, and +repairing and another for document traversal. The general API is provided below: + + tidy_create() Initialize and return a tidy document resource + tidy_parse_file($tidy, $file) Parse the document stored in $file + tidy_parse_string($tidy, $str) Parse the string stored in $str + + tidy_clean_repair($tidy) Clean and repair the document + tidy_diagnose($tidy) Diagnose a parsed document + + tidy_setopt($tidy, $opt, $val) Set a configuration option $opt to $val + tidy_getopt($tidy, $opt) Retrieve a configuration option + + ** note: $opt is a string representing the option. Right now the only + source of these options is the LibTidy source.. 
eventually I'll document + them officially -- see the src/config.c file in the tidy source ** + + tidy_get_output($tidy) Return the cleaned tidy HTML as a string + tidy_get_error_buffer($tidy) Return a log of the errors and warnings + returned by tidy + + tidy_get_release() Return the Libtidy release date + tidy_get_status($tidy) Return the status of the document + tidy_get_html_ver($tidy) Return the major HTML version detected for + the document + + tidy_is_xhtml($tidy) Determines if the document is XHTML + tidy_is_xml($tidy) Determines if the document is a generic XML document + + tidy_error_count($tidy) Returns the number of errors in the document + tidy_warning_count($tidy) Returns the number of warnings in the document + tidy_access_count($tidy) Returns the number of accessibility-related + warnings in the document. + tidy_config_count($tidy) Returns the number of configuration errors found + + tidy_load_config($tidy, $file) Loads the specified configuration file + tidy_load_config_enc($tidy, + $file, + $enc) Loads the specified config file using the specified + character encoding + tidy_set_encoding($tidy, $enc) Sets the current character encoding for the document + tidy_save_config($tidy, $file) Saves the current config to $file + + +Beyond these general-purpose API functions, Tidy also supports the following +functions which are used to retrieve an object for document traversal: + + tidy_get_root($tidy) Returns an object starting at the root of the + document + tidy_get_head($tidy) Returns an object starting at the <HEAD> tag + tidy_get_html($tidy) Returns an object starting at the <HTML> tag + tidy_get_body($tidy) Returns an object starting at the <BODY> tag + +All navigation of the specified document is done via the PHP5 object constructs. +There are two types of objects which Tidy can create. The first is TidyNode, which +represents HTML tags, text, and more (see the TidyNode_Type Constants). 
The second +is TidyAttr, which represents an attribute within an HTML tag (TidyNode). The +functionality of these objects is represented by the following schema: + +class TidyNode { + + public $name; // name of node (e.g. HEAD) + public $value; // value of node (everything between tags) + public $type; // type of node (text, php, asp, etc.) + public $id; // id of node (e.g. TIDY_TAG_HEAD) + + public $line; // line # of node in source + public $column; // column # of node in source + + public $html_ver; // HTML version (0,1,2,3,4) + + public $attribs; // an array of attributes (see TidyAttr) + public $children; // an array of child nodes + + function has_siblings(); // any sibling nodes? + function has_children(); // any child nodes? + function has_parent(); // have a parent? + + function is_comment(); // is node a comment? + function is_xhtml(); // is document XHTML? + function is_xml(); // is document generic XML (not HTML/XHTML)? + function is_text(); // is node text? + function is_html(); // is node an HTML tag? + + function is_jste(); // is jste block? + function is_asp(); // is Microsoft ASP block? + function is_php(); // is PHP block? + + function next(); // returns next node + function prev(); // returns prev node + function parent(); // returns parent node + function child(); // returns first child node + + /* Searches for a particular attribute in the current node based + on attribute ID. 
If found, returns a TidyAttr object for it */ + function get_attr_type($attr_id); + + /* + + NOT YET IMPLEMENTED + + Recursively traverses the tree from the current node and returns + an array of attributes matching the node ID/attr ID pair + + Useful for pulling out things like links: + foreach($body->fetch_attrs(TIDY_TAG_A, TIDY_ATTR_HREF) as $link) { + echo "Link : {$link->value}\n"; + } + */ + + function fetch_attrs($node_id, $attr_id); + + /* + + NOT YET IMPLEMENTED + + Recursively traverses the tree from the current node and returns + an array of nodes matching the node ID + + Useful for pulling out tables, etc. (echoes the HTML for every + <TABLE> block) + + foreach($body->fetch_nodes(TIDY_TAG_TABLE) as $table) { + + echo $table->value; + + } + */ + function fetch_nodes($node_id); +} + +class TidyAttr { + + public $name; // attribute name e.g. HREF + public $value; // attribute value + public $id; // attribute id e.g. TIDY_ATTR_HREF + + function next(); // returns next attribute in tag + function tag(); // returns the tag node associated with attribute +} + +Examples of using these objects to navigate the tree can be found in the examples/ +directory (I suggest looking at urlgrab.php and dumpit.php) + +E-mail thoughts, suggestions, patches, etc. to <john@php.net>
\ No newline at end of file |
