ext/tidy/examples/urlgrab.php


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63

<?php

    /*
     * urlgrab.php
     *
     * A simple command-line utility to extract all of the URLs contained
     * within <A HREF> tags from a document.
     *
     * By: John Coggeshall <john@php.net>
     *
     * Usage: php urlgrab.php <file>
     *
     */
     
    /*
     * Main script: parse the file named on the command line with Tidy,
     * repair it, and print every link found by get_links().
     *
     * Refuse to run without a file argument: otherwise $_SERVER['argv'][1]
     * is undefined and Tidy would be handed an empty filename.
     */
    if (!isset($_SERVER['argv'][1])) {
        echo "Usage: php urlgrab.php <file>\n";
        exit(1);
    }

    /* Create a Tidy Resource */
    $tidy = tidy_create();

    /* Parse the document named by the first command-line argument */
    tidy_parse_file($tidy, $_SERVER['argv'][1]);

    /* Fix up the document */
    tidy_clean_repair($tidy);

    /* Get an object representing everything from the <HTML> tag in */
    $html = tidy_get_html($tidy);

    /* Traverse the document tree and dump the collected URLs */
    print_r(get_links($html));
    
    /*
     * get_links()
     *
     * Collects the HREF value of every <A> tag found at $node and in all
     * of its descendants. A node's own link is recorded before the links
     * of its children, and children are visited in the order they are
     * listed in $node->children.
     *
     * $node - a Tidy node (for example the one returned by tidy_get_html())
     *
     * Returns an array of HREF values; empty when no links were found.
     */
    function get_links($node) {
        /* Only an <A> tag is asked for its HREF attribute */
        $href = ($node->id == TIDY_TAG_A)
            ? $node->get_attr_type(TIDY_ATTR_HREF)
            : null;

        /* Start the result with this node's own link, if it has one */
        $found = $href ? array($href->value) : array();

        /* A node without children has nothing further to contribute */
        if(!$node->has_children()) {
            return $found;
        }

        /* Recurse into each child and append whatever it yields */
        foreach($node->children as $kid) {
            $found = array_merge($found, get_links($kid));
        }

        return $found;
    }
    
?>