// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// The linkcheck command finds missing links in the godoc website.
// It crawls a URL recursively, notes every URL and URL fragment it
// sees, and prints a report of the missing links at the end.
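//
// Usage:
//
//	linkcheck [-root=http://localhost:6060] [-verbose]
//
// It exits with a non-zero status if any problems are found.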
package main

import (
	"errors"
	"flag"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"regexp"
	"strings"
	"sync"
)

var (
	root    = flag.String("root", "http://localhost:6060", "Root to crawl")
	verbose = flag.Bool("verbose", false, "verbose")
)

var wg sync.WaitGroup        // outstanding fetches
var urlq = make(chan string) // URLs to crawl
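
// URLs flow in one direction: crawl queues them on urlq, and the single
// crawlLoop goroutine drains the channel and fetches each page in turn.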

// urlFrag is a URL and its optional #fragment (without the #).
type urlFrag struct {
	url, frag string
}

var (
	mu          sync.Mutex
	crawled     = make(map[string]bool)      // URL without fragment -> true
	neededFrags = make(map[urlFrag][]string) // URL#frag -> who needs it
)
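
// aRx matches root-relative links, capturing the path: for example, it
// captures "/pkg/fmt/" from `<a href="/pkg/fmt/">`.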
var aRx = regexp.MustCompile(`<a href=['"]?(/[^\s'">]+)`)

// Owned by crawlLoop goroutine:
var (
	linkSources = make(map[string][]string) // url no fragment -> sources
	fragExists  = make(map[urlFrag]bool)
	problems    []string
)
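
// localLinks returns a deduplicated list of the root-relative links
// (href values starting with "/") found in body, skipping links into
// the /src/ tree.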
func localLinks(body string) (links []string) {
	seen := map[string]bool{}
	mv := aRx.FindAllStringSubmatch(body, -1)
	for _, m := range mv {
		ref := m[1]
		if strings.HasPrefix(ref, "/src/") {
			continue
		}
		if !seen[ref] {
			seen[ref] = true
			links = append(links, ref)
		}
	}
	return
}
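
// idRx matches id attributes, capturing their values; each id defines
// an anchor that a #fragment can point at.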
var idRx = regexp.MustCompile(`\bid=['"]?([^\s'">]+)`)
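
// pageIDs returns the values of all id attributes in body: the set of
// fragments that actually exist on the page.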
func pageIDs(body string) (ids []string) {
	mv := idRx.FindAllStringSubmatch(body, -1)
	for _, m := range mv {
		ids = append(ids, m[1])
	}
	return
}

// crawl queues url to be fetched unless it has already been crawled.
// url may contain a #fragment, and the fragment is then noted as
// needing to exist; sourceURL is the page the link was found on.
func crawl(url string, sourceURL string) {
	if strings.Contains(url, "/devel/release") {
		return
	}
	mu.Lock()
	defer mu.Unlock()
	if u, frag, ok := strings.Cut(url, "#"); ok {
		url = u
		if frag != "" {
			uf := urlFrag{url, frag}
			neededFrags[uf] = append(neededFrags[uf], sourceURL)
		}
	}
	if crawled[url] {
		return
	}
	crawled[url] = true
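
	// Send on urlq from a new goroutine: the channel is unbuffered and
	// crawl can be called from crawlLoop itself (via doCrawl), so sending
	// inline could deadlock. wg is incremented first so that main's
	// wg.Wait accounts for the pending fetch.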
	wg.Add(1)
	go func() {
		urlq <- url
	}()
}
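
// addProblem records a problem with url, logging it immediately in
// verbose mode; linkSources supplies the pages that linked to it.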
func addProblem(url, errmsg string) {
	msg := fmt.Sprintf("Error on %s: %s (from %s)", url, errmsg, linkSources[url])
	if *verbose {
		log.Print(msg)
	}
	problems = append(problems, msg)
}
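
// crawlLoop fetches URLs from urlq, one at a time, until the channel
// is closed, recording a problem for each fetch that fails.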
func crawlLoop() {
	for url := range urlq {
		if err := doCrawl(url); err != nil {
			addProblem(url, err.Error())
		}
	}
}
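
// doCrawl fetches url, records the fragments (ids) that the page
// defines, and queues the page's local links for crawling.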
func doCrawl(url string) error {
	defer wg.Done()

	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return err
	}
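	// RoundTrip, unlike http.Get, does not follow redirects, so 3xx
	// responses come back to us and are scope-checked below.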
	res, err := http.DefaultTransport.RoundTrip(req)
	if err != nil {
		return err
	}
	// Handle redirects.
	if res.StatusCode/100 == 3 {
		newURL, err := res.Location()
		if err != nil {
			return fmt.Errorf("resolving redirect: %v", err)
		}
		if !strings.HasPrefix(newURL.String(), *root) {
			// Skip off-site redirects.
			return nil
		}
		crawl(newURL.String(), url)
		return nil
	}
	if res.StatusCode != 200 {
		return errors.New(res.Status)
	}
	slurp, err := io.ReadAll(res.Body)
	res.Body.Close()
	if err != nil {
		log.Fatalf("Error reading %s body: %v", url, err)
	}
	if *verbose {
		log.Printf("Len of %s: %d", url, len(slurp))
	}
	body := string(slurp)
	for _, ref := range localLinks(body) {
		if *verbose {
			log.Printf(" links to %s", ref)
		}
		dest := *root + ref
		linkSources[dest] = append(linkSources[dest], url)
		crawl(dest, url)
	}
	for _, id := range pageIDs(body) {
		if *verbose {
			log.Printf(" url %s has #%s", url, id)
		}
		fragExists[urlFrag{url, id}] = true
	}
	return nil
}

func main() {
	flag.Parse()

	go crawlLoop()
	crawl(*root, "")
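
	// Every queued URL holds a wg count until its fetch completes, so
	// once Wait returns no new work can arrive and urlq can be closed
	// to stop crawlLoop.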
	wg.Wait()
	close(urlq)
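
	// Report every fragment that some page linked to but no page defined.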
	for uf, needers := range neededFrags {
		if !fragExists[uf] {
			problems = append(problems, fmt.Sprintf("Missing fragment for %+v from %v", uf, needers))
		}
	}

	for _, s := range problems {
		fmt.Println(s)
	}
	if len(problems) > 0 {
		os.Exit(1)
	}
}