linkcheck.go

// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// The linkcheck command finds missing links in the godoc website.
// It crawls a URL recursively and notes URLs and URL fragments
// that it's seen and prints a report of missing links at the end.
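//
// Example invocation (a sketch; it assumes a godoc server is already
// listening on the default address, and the flag values are illustrative):
//
//	go run linkcheck.go -root=http://localhost:6060 -verbose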
package main

import (
	"errors"
	"flag"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"regexp"
	"strings"
	"sync"
)

var (
	root    = flag.String("root", "http://localhost:6060", "Root to crawl")
	verbose = flag.Bool("verbose", false, "verbose")
)

var wg sync.WaitGroup        // outstanding fetches
var urlq = make(chan string) // URLs to crawl

// urlFrag is a URL and its optional #fragment (without the #)
type urlFrag struct {
	url, frag string
}

var (
	mu          sync.Mutex
	crawled     = make(map[string]bool)      // URL without fragment -> true
	neededFrags = make(map[urlFrag][]string) // URL#frag -> who needs it
)

var aRx = regexp.MustCompile(`<a href=['"]?(/[^\s'">]+)`)

// Owned by crawlLoop goroutine:
var (
	linkSources = make(map[string][]string) // url no fragment -> sources
	fragExists  = make(map[urlFrag]bool)
	problems    []string
)
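
// localLinks returns the unique same-site hrefs found in body, skipping
// links under /src/.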
func localLinks(body string) (links []string) {
	seen := map[string]bool{}
	mv := aRx.FindAllStringSubmatch(body, -1)
	for _, m := range mv {
		ref := m[1]
		if strings.HasPrefix(ref, "/src/") {
			continue
		}
		if !seen[ref] {
			seen[ref] = true
			links = append(links, m[1])
		}
	}
	return
}

var idRx = regexp.MustCompile(`\bid=['"]?([^\s'">]+)`)
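
// pageIDs returns all id attribute values found in body.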
func pageIDs(body string) (ids []string) {
	mv := idRx.FindAllStringSubmatch(body, -1)
	for _, m := range mv {
		ids = append(ids, m[1])
	}
	return
}
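
// crawl queues url to be fetched if it hasn't been crawled already.
// It is safe to call from multiple goroutines: crawled and neededFrags
// are guarded by mu.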
// url may contain a #fragment, and the fragment is then noted as needing to exist.
func crawl(url string, sourceURL string) {
	if strings.Contains(url, "/devel/release") {
		return
	}
	mu.Lock()
	defer mu.Unlock()
	if u, frag, ok := strings.Cut(url, "#"); ok {
		url = u
		if frag != "" {
			uf := urlFrag{url, frag}
			neededFrags[uf] = append(neededFrags[uf], sourceURL)
		}
	}
	if crawled[url] {
		return
	}
	crawled[url] = true

	wg.Add(1)
	go func() {
		urlq <- url
	}()
}
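
// addProblem records (and, with -verbose, logs) a broken-link report for url.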
func addProblem(url, errmsg string) {
	msg := fmt.Sprintf("Error on %s: %s (from %s)", url, errmsg, linkSources[url])
	if *verbose {
		log.Print(msg)
	}
	problems = append(problems, msg)
}
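
// crawlLoop runs in its own goroutine, fetching each URL queued on urlq.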
func crawlLoop() {
	for url := range urlq {
		if err := doCrawl(url); err != nil {
			addProblem(url, err.Error())
		}
	}
}
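
// doCrawl fetches url, follows same-site redirects, queues the local links
// it finds, and records the fragment IDs present on the page.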
func doCrawl(url string) error {
	defer wg.Done()

	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return err
	}
	res, err := http.DefaultTransport.RoundTrip(req)
	if err != nil {
		return err
	}
	// Handle redirects.
	if res.StatusCode/100 == 3 {
		newURL, err := res.Location()
		if err != nil {
			return fmt.Errorf("resolving redirect: %v", err)
		}
		if !strings.HasPrefix(newURL.String(), *root) {
			// Skip off-site redirects.
			return nil
		}
		crawl(newURL.String(), url)
		return nil
	}
	if res.StatusCode != 200 {
		return errors.New(res.Status)
	}
	slurp, err := io.ReadAll(res.Body)
	res.Body.Close()
	if err != nil {
		log.Fatalf("Error reading %s body: %v", url, err)
	}
	if *verbose {
		log.Printf("Len of %s: %d", url, len(slurp))
	}
	body := string(slurp)
	for _, ref := range localLinks(body) {
		if *verbose {
			log.Printf(" links to %s", ref)
		}
		dest := *root + ref
		linkSources[dest] = append(linkSources[dest], url)
		crawl(dest, url)
	}
	for _, id := range pageIDs(body) {
		if *verbose {
			log.Printf(" url %s has #%s", url, id)
		}
		fragExists[urlFrag{url, id}] = true
	}
	return nil
}
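
// main kicks off the crawl at -root, waits for all fetches to finish, then
// reports any links whose target fragments were never seen.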
func main() {
	flag.Parse()

	go crawlLoop()
	crawl(*root, "")

	wg.Wait()
	close(urlq)

	for uf, needers := range neededFrags {
		if !fragExists[uf] {
			problems = append(problems, fmt.Sprintf("Missing fragment for %+v from %v", uf, needers))
		}
	}

	for _, s := range problems {
		fmt.Println(s)
	}
	if len(problems) > 0 {
		os.Exit(1)
	}
}