linkchecker tool now visits the URL to determine if it's valid

This commit is contained in:
Chao Xu
2016-01-09 17:03:39 -08:00
parent 3328cfe1c5
commit 3ec4cd423e
8 changed files with 157 additions and 51 deletions

View File

@@ -14,56 +14,155 @@ See the License for the specific language governing permissions and
limitations under the License.
*/
// This command checks if the hyperlinks in files are valid. It checks the files
// with 'fileSuffix' in 'rootDir' for URLs that match 'prefix'. It trims the
// 'prefix' from the URL, uses what's left as the relative path to repoRoot to
// verify if the link is valid. For example:
// $ linkcheck --root-dir=${TYPEROOT} --repo-root=${KUBE_ROOT} \
// --file-suffix=types.go --prefix=http://releases.k8s.io/HEAD
// This tool extracts the links from types.go and .md files, visits the link and
// checks the status code of the response.
// Usage:
// $ linkcheck --root-dir=${ROOT}
package main
import (
"fmt"
"io/ioutil"
"net/http"
"os"
"path"
"path/filepath"
"regexp"
"strconv"
"strings"
"time"
"github.com/mvdan/xurls"
flag "github.com/spf13/pflag"
)
var (
httpRE *regexp.Regexp
rootDir = flag.String("root-dir", "", "Root directory containing documents to be processed.")
repoRoot = flag.String("repo-root", "", `Root directory of k8s repository.`)
fileSuffix = flag.String("file-suffix", "", "suffix of files to be checked")
prefix = flag.String("prefix", "", "Longest common prefix of the link URL, e.g., http://release.k8s.io/HEAD/ for links in pkg/api/types.go")
fileSuffix = flag.StringSlice("file-suffix", []string{"types.go", ".md"}, "suffix of files to be checked")
// URLs matching the patterns in the regWhiteList won't be checked. Patterns
// of dummy URLs should be added to the list to avoid false alerts. Also,
// patterns of URLs that we don't care about can be added here to improve
// efficiency.
regWhiteList = []*regexp.Regexp{
regexp.MustCompile(`https://kubernetes-site\.appspot\.com`),
// skip url that doesn't start with an English alphabet, e.g., URLs with IP addresses.
regexp.MustCompile(`https?://[^A-Za-z].*`),
regexp.MustCompile(`https?://localhost.*`),
}
// URLs listed in the fullURLWhiteList won't be checked. This separated from
// the RegWhiteList to improve efficiency. This list includes dummy URLs that
// are hard to be generalized by a regex, and URLs that will cause false alerts.
fullURLWhiteList = map[string]struct{}{
"http://github.com/some/repo.git": {},
// This URL returns 404 when visited by this tool, but it works fine if visited by a browser.
"http://stackoverflow.com/questions/ask?tags=kubernetes": {},
"https://github.com/$YOUR_GITHUB_USERNAME/kubernetes.git": {},
"https://github.com/$YOUR_GITHUB_USERNAME/kubernetes": {},
"http://storage.googleapis.com/kubernetes-release/release/v${K8S_VERSION}/bin/darwin/amd64/kubectl": {},
// It seems this server expects certain User-Agent value, it works fine with Chrome, but returns 404 if we issue a plain cURL to it.
"http://supervisord.org/": {},
"http://kubernetes.io/vX.Y/docs": {},
"http://kubernetes.io/vX.Y/docs/": {},
"http://kubernetes.io/vX.Y/": {},
}
visitedURLs = map[string]struct{}{}
htmlpreviewReg = regexp.MustCompile(`https://htmlpreview\.github\.io/\?`)
httpOrhttpsReg = regexp.MustCompile(`https?.*`)
)
func newWalkFunc(invalidLink *bool) filepath.WalkFunc {
func newWalkFunc(invalidLink *bool, client *http.Client) filepath.WalkFunc {
return func(filePath string, info os.FileInfo, err error) error {
if !strings.HasSuffix(info.Name(), *fileSuffix) {
hasSuffix := false
for _, suffix := range *fileSuffix {
hasSuffix = hasSuffix || strings.HasSuffix(info.Name(), suffix)
}
if !hasSuffix {
return nil
}
fileBytes, err := ioutil.ReadFile(filePath)
if err != nil {
return err
}
foundInvalid := false
matches := httpRE.FindAllSubmatch(fileBytes, -1)
for _, match := range matches {
// match[1] should look like docs/devel/api-conventions.md
if _, err := os.Stat(path.Join(*repoRoot, string(match[1]))); err != nil {
fmt.Fprintf(os.Stderr, "Link is not valid: %s\n", string(match[0]))
allURLs := xurls.Strict.FindAll(fileBytes, -1)
fmt.Fprintf(os.Stdout, "\nChecking file %s\n", filePath)
URL:
for _, URL := range allURLs {
// Don't check non http/https URL
if !httpOrhttpsReg.Match(URL) {
continue
}
for _, whiteURL := range regWhiteList {
if whiteURL.Match(URL) {
continue URL
}
}
if _, found := fullURLWhiteList[string(URL)]; found {
continue
}
// remove the htmlpreview Prefix
processedURL := htmlpreviewReg.ReplaceAll(URL, []byte{})
// check if we have visited the URL.
if _, found := visitedURLs[string(processedURL)]; found {
continue
}
visitedURLs[string(processedURL)] = struct{}{}
retry := 0
const maxRetry int = 3
backoff := 100
for retry < maxRetry {
fmt.Fprintf(os.Stdout, "Visiting %s\n", string(processedURL))
// Use verb HEAD to increase efficiency. However, some servers
// do not handle HEAD well, so we need to try a GET to avoid
// false alert.
resp, err := client.Head(string(processedURL))
// URLs with mock host or mock port will cause error. If we report
// the error here, people need to add the mock URL to the white
// list every time they add a mock URL, which will be a maintenance
// nightmare. Hence, we decide to only report 404 to catch the
// cases where host and port are legit, but path is not, which
// is the most common mistake in our docs.
if err != nil {
break
}
if resp.StatusCode == 429 {
retryAfter := resp.Header.Get("Retry-After")
if seconds, err := strconv.Atoi(retryAfter); err != nil {
backoff = seconds + 10
}
fmt.Fprintf(os.Stderr, "Got %d visiting %s, retry after %d seconds.\n", resp.StatusCode, string(URL), backoff)
time.Sleep(time.Duration(backoff) * time.Second)
backoff *= 2
retry++
} else if resp.StatusCode == 404 {
// We only check for 404 error for now. 401, 403 errors are hard to handle.
// We need to try a GET to avoid false alert.
resp, err = client.Get(string(processedURL))
if err != nil {
break
}
if resp.StatusCode != 404 {
continue URL
}
foundInvalid = true
fmt.Fprintf(os.Stderr, "Failed: in file %s, Got %d visiting %s\n", filePath, resp.StatusCode, string(URL))
break
} else {
break
}
}
if retry == maxRetry {
foundInvalid = true
fmt.Fprintf(os.Stderr, "Failed: in file %s, still got 429 visiting %s after %d retries\n", filePath, string(URL), maxRetry)
}
}
if foundInvalid {
fmt.Fprintf(os.Stderr, "Found invalid links in %s\n", filePath)
*invalidLink = true
}
return nil
@@ -72,14 +171,16 @@ func newWalkFunc(invalidLink *bool) filepath.WalkFunc {
func main() {
flag.Parse()
httpRE = regexp.MustCompile(*prefix + `(.*\.md)`)
if *rootDir == "" || *repoRoot == "" || *prefix == "" {
if *rootDir == "" {
flag.Usage()
os.Exit(2)
}
client := http.Client{
Timeout: time.Duration(5 * time.Second),
}
invalidLink := false
if err := filepath.Walk(*rootDir, newWalkFunc(&invalidLink)); err != nil {
if err := filepath.Walk(*rootDir, newWalkFunc(&invalidLink, &client)); err != nil {
fmt.Fprintf(os.Stderr, "Fail: %v.\n", err)
os.Exit(2)
}