From 329aa0cdcdbbf852c5b01c33046faa4cc2d4ba59 Mon Sep 17 00:00:00 2001 From: Andrew Gerrand Date: Fri, 25 Oct 2013 17:31:02 +0300 Subject: misc/linkcheck: better redirect handling, use meaningful exit code Prevent linkcheck from following redirects that lead beyond the outside the root URL. Return a non-zero exit code when there are problems. Some minor refactoring for clarity. R=golang-dev, bradfitz CC=golang-dev https://codereview.appspot.com/14425049 --- misc/linkcheck/linkcheck.go | 90 +++++++++++++++++++++++++++++---------------- 1 file changed, 58 insertions(+), 32 deletions(-) (limited to 'misc/linkcheck') diff --git a/misc/linkcheck/linkcheck.go b/misc/linkcheck/linkcheck.go index 01e9879a1..d9bfd2f76 100644 --- a/misc/linkcheck/linkcheck.go +++ b/misc/linkcheck/linkcheck.go @@ -8,11 +8,13 @@ package main import ( + "errors" "flag" "fmt" "io/ioutil" "log" "net/http" + "os" "regexp" "strings" "sync" @@ -101,49 +103,71 @@ func crawl(url string, sourceURL string) { func addProblem(url, errmsg string) { msg := fmt.Sprintf("Error on %s: %s (from %s)", url, errmsg, linkSources[url]) - log.Print(msg) + if *verbose { + log.Print(msg) + } problems = append(problems, msg) } func crawlLoop() { for url := range urlq { - res, err := http.Get(url) - if err != nil { - addProblem(url, fmt.Sprintf("Error fetching: %v", err)) - wg.Done() - continue + if err := doCrawl(url); err != nil { + addProblem(url, err.Error()) } - if res.StatusCode != 200 { - addProblem(url, fmt.Sprintf("Status code = %d", res.StatusCode)) - wg.Done() - continue - } - slurp, err := ioutil.ReadAll(res.Body) - res.Body.Close() + } +} + +func doCrawl(url string) error { + defer wg.Done() + + req, err := http.NewRequest("GET", url, nil) + if err != nil { + return err + } + res, err := http.DefaultTransport.RoundTrip(req) + if err != nil { + return err + } + // Handle redirects. + if res.StatusCode/100 == 3 { + newURL, err := res.Location() if err != nil { - log.Fatalf("Error reading %s body: %v", url, err) + return fmt.Errorf("resolving redirect: %v", err) } - if *verbose { - log.Printf("Len of %s: %d", url, len(slurp)) + if !strings.HasPrefix(newURL.String(), *root) { + // Skip off-site redirects. + return nil } - body := string(slurp) - for _, ref := range localLinks(body) { - if *verbose { - log.Printf(" links to %s", ref) - } - dest := *root + ref - linkSources[dest] = append(linkSources[dest], url) - crawl(dest, url) + crawl(newURL.String(), url) + return nil + } + if res.StatusCode != 200 { + return errors.New(res.Status) + } + slurp, err := ioutil.ReadAll(res.Body) + res.Body.Close() + if err != nil { + log.Fatalf("Error reading %s body: %v", url, err) + } + if *verbose { + log.Printf("Len of %s: %d", url, len(slurp)) + } + body := string(slurp) + for _, ref := range localLinks(body) { + if *verbose { + log.Printf(" links to %s", ref) } - for _, id := range pageIDs(body) { - if *verbose { - log.Printf(" url %s has #%s", url, id) - } - fragExists[urlFrag{url, id}] = true + dest := *root + ref + linkSources[dest] = append(linkSources[dest], url) + crawl(dest, url) + } + for _, id := range pageIDs(body) { + if *verbose { + log.Printf(" url %s has #%s", url, id) } - - wg.Done() + fragExists[urlFrag{url, id}] = true } + return nil } func main() { @@ -151,7 +175,6 @@ func main() { go crawlLoop() crawl(*root, "") - crawl(*root+"/doc/go1.1.html", "") wg.Wait() close(urlq) @@ -164,4 +187,7 @@ func main() { for _, s := range problems { fmt.Println(s) } + if len(problems) > 0 { + os.Exit(1) + } } -- cgit v1.2.1