linkchecker tool now visits the URL to determine if it's valid

2026-01-05 15:37:24 +00:00 · 2016-01-09 17:03:39 -08:00
parent 3328cfe1c5
commit 3ec4cd423e
8 changed files with 157 additions and 51 deletions
--- a/cmd/linkcheck/links.go
+++ b/cmd/linkcheck/links.go
@@ -14,56 +14,155 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */

-// This command checks if the hyperlinks in files are valid. It checks the files
-// with 'fileSuffix' in 'rootDir' for URLs that match 'prefix'. It trims the
-// 'prefix' from the URL, uses what's left as the relative path to repoRoot to
-// verify if the link is valid. For example:
-// $ linkcheck --root-dir=${TYPEROOT} --repo-root=${KUBE_ROOT} \
-// --file-suffix=types.go --prefix=http://releases.k8s.io/HEAD
+// This tool extracts the links from types.go and .md files, visits the link and
+// checks the status code of the response.
+// Usage:
+// $ linkcheck --root-dir=${ROOT}

 package main

 import (
 	"fmt"
 	"io/ioutil"
+	"net/http"
 	"os"
-	"path"
 	"path/filepath"
 	"regexp"
+	"strconv"
 	"strings"
+	"time"

+	"github.com/mvdan/xurls"
 	flag "github.com/spf13/pflag"
 )

 var (
-	httpRE *regexp.Regexp
-
 	rootDir    = flag.String("root-dir", "", "Root directory containing documents to be processed.")
-	repoRoot   = flag.String("repo-root", "", `Root directory of k8s repository.`)
-	fileSuffix = flag.String("file-suffix", "", "suffix of files to be checked")
-	prefix     = flag.String("prefix", "", "Longest common prefix of the link URL, e.g., http://release.k8s.io/HEAD/ for links in pkg/api/types.go")
+	fileSuffix = flag.StringSlice("file-suffix", []string{"types.go", ".md"}, "suffix of files to be checked")
+	// URLs matching the patterns in the regWhiteList won't be checked. Patterns
+	// of dummy URLs should be added to the list to avoid false alerts. Also,
+	// patterns of URLs that we don't care about can be added here to improve
+	// efficiency.
+	regWhiteList = []*regexp.Regexp{
+		regexp.MustCompile(`https://kubernetes-site\.appspot\.com`),
+		// skip url that doesn't start with an English alphabet, e.g., URLs with IP addresses.
+		regexp.MustCompile(`https?://[^A-Za-z].*`),
+		regexp.MustCompile(`https?://localhost.*`),
+	}
+	// URLs listed in the fullURLWhiteList won't be checked. This separated from
+	// the RegWhiteList to improve efficiency. This list includes dummy URLs that
+	// are hard to be generalized by a regex, and URLs that will cause false alerts.
+	fullURLWhiteList = map[string]struct{}{
+		"http://github.com/some/repo.git": {},
+		// This URL returns 404 when visited by this tool, but it works fine if visited by a browser.
+		"http://stackoverflow.com/questions/ask?tags=kubernetes":                                            {},
+		"https://github.com/$YOUR_GITHUB_USERNAME/kubernetes.git":                                           {},
+		"https://github.com/$YOUR_GITHUB_USERNAME/kubernetes":                                               {},
+		"http://storage.googleapis.com/kubernetes-release/release/v${K8S_VERSION}/bin/darwin/amd64/kubectl": {},
+		// It seems this server expects certain User-Agent value, it works fine with Chrome, but returns 404 if we issue a plain cURL to it.
+		"http://supervisord.org/":         {},
+		"http://kubernetes.io/vX.Y/docs":  {},
+		"http://kubernetes.io/vX.Y/docs/": {},
+		"http://kubernetes.io/vX.Y/":      {},
+	}
+
+	visitedURLs    = map[string]struct{}{}
+	htmlpreviewReg = regexp.MustCompile(`https://htmlpreview\.github\.io/\?`)
+	httpOrhttpsReg = regexp.MustCompile(`https?.*`)
 )

-func newWalkFunc(invalidLink *bool) filepath.WalkFunc {
+func newWalkFunc(invalidLink *bool, client *http.Client) filepath.WalkFunc {
 	return func(filePath string, info os.FileInfo, err error) error {
-		if !strings.HasSuffix(info.Name(), *fileSuffix) {
+		hasSuffix := false
+		for _, suffix := range *fileSuffix {
+			hasSuffix = hasSuffix || strings.HasSuffix(info.Name(), suffix)
+		}
+		if !hasSuffix {
 			return nil
 		}
+
 		fileBytes, err := ioutil.ReadFile(filePath)
 		if err != nil {
 			return err
 		}
 		foundInvalid := false
-		matches := httpRE.FindAllSubmatch(fileBytes, -1)
-		for _, match := range matches {
-			// match[1] should look like docs/devel/api-conventions.md
-			if _, err := os.Stat(path.Join(*repoRoot, string(match[1]))); err != nil {
-				fmt.Fprintf(os.Stderr, "Link is not valid: %s\n", string(match[0]))
+		allURLs := xurls.Strict.FindAll(fileBytes, -1)
+		fmt.Fprintf(os.Stdout, "\nChecking file %s\n", filePath)
+	URL:
+		for _, URL := range allURLs {
+			// Don't check non http/https URL
+			if !httpOrhttpsReg.Match(URL) {
+				continue
+			}
+			for _, whiteURL := range regWhiteList {
+				if whiteURL.Match(URL) {
+					continue URL
+				}
+			}
+			if _, found := fullURLWhiteList[string(URL)]; found {
+				continue
+			}
+			// remove the htmlpreview Prefix
+			processedURL := htmlpreviewReg.ReplaceAll(URL, []byte{})
+
+			// check if we have visited the URL.
+			if _, found := visitedURLs[string(processedURL)]; found {
+				continue
+			}
+			visitedURLs[string(processedURL)] = struct{}{}
+
+			retry := 0
+			const maxRetry int = 3
+			backoff := 100
+			for retry < maxRetry {
+				fmt.Fprintf(os.Stdout, "Visiting %s\n", string(processedURL))
+				// Use verb HEAD to increase efficiency. However, some servers
+				// do not handle HEAD well, so we need to try a GET to avoid
+				// false alert.
+				resp, err := client.Head(string(processedURL))
+				// URLs with mock host or mock port will cause error. If we report
+				// the error here, people need to add the mock URL to the white
+				// list every time they add a mock URL, which will be a maintenance
+				// nightmare. Hence, we decide to only report 404 to catch the
+				// cases where host and port are legit, but path is not, which
+				// is the most common mistake in our docs.
+				if err != nil {
+					break
+				}
+				if resp.StatusCode == 429 {
+					retryAfter := resp.Header.Get("Retry-After")
+					if seconds, err := strconv.Atoi(retryAfter); err != nil {
+						backoff = seconds + 10
+					}
+					fmt.Fprintf(os.Stderr, "Got %d visiting %s, retry after %d seconds.\n", resp.StatusCode, string(URL), backoff)
+					time.Sleep(time.Duration(backoff) * time.Second)
+					backoff *= 2
+					retry++
+				} else if resp.StatusCode == 404 {
+					// We only check for 404 error for now. 401, 403 errors are hard to handle.
+
+					// We need to try a GET to avoid false alert.
+					resp, err = client.Get(string(processedURL))
+					if err != nil {
+						break
+					}
+					if resp.StatusCode != 404 {
+						continue URL
+					}
+
+					foundInvalid = true
+					fmt.Fprintf(os.Stderr, "Failed: in file %s, Got %d visiting %s\n", filePath, resp.StatusCode, string(URL))
+					break
+				} else {
+					break
+				}
+			}
+			if retry == maxRetry {
 				foundInvalid = true
+				fmt.Fprintf(os.Stderr, "Failed: in file %s, still got 429 visiting %s after %d retries\n", filePath, string(URL), maxRetry)
 			}
 		}
 		if foundInvalid {
-			fmt.Fprintf(os.Stderr, "Found invalid links in %s\n", filePath)
 			*invalidLink = true
 		}
 		return nil
@@ -72,14 +171,16 @@ func newWalkFunc(invalidLink *bool) filepath.WalkFunc {

 func main() {
 	flag.Parse()
-	httpRE = regexp.MustCompile(*prefix + `(.*\.md)`)

-	if *rootDir == "" || *repoRoot == "" || *prefix == "" {
+	if *rootDir == "" {
 		flag.Usage()
 		os.Exit(2)
 	}
+	client := http.Client{
+		Timeout: time.Duration(5 * time.Second),
+	}
 	invalidLink := false
-	if err := filepath.Walk(*rootDir, newWalkFunc(&invalidLink)); err != nil {
+	if err := filepath.Walk(*rootDir, newWalkFunc(&invalidLink, &client)); err != nil {
 		fmt.Fprintf(os.Stderr, "Fail: %v.\n", err)
 		os.Exit(2)
 	}