mirror of
https://github.com/kata-containers/kata-containers.git
synced 2025-04-28 03:42:09 +00:00
Some copyright dates were not updated with the most recent changes to code; update them. Fixes #8595 Signed-off-by: Chelsea Mafrica <chelsea.e.mafrica@intel.com>
337 lines
8.6 KiB
Bash
Executable File
337 lines
8.6 KiB
Bash
Executable File
#!/bin/bash
# Copyright (c) 2019-2023 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
# Description: spell-check utility.

# Enable command tracing when DEBUG is set to a non-empty value.
[ -n "${DEBUG:-}" ] && set -x

set -o errexit
set -o pipefail
set -o nounset

# Ensure we spell check in English
LANG=C
LC_ALL=C

script_name=${0##*/}

if [ "$(uname -s)" == "Darwin" ]
then
    # Hunspell dictionaries are not easily available
    # on this platform it seems.
    echo "INFO: $script_name: OSX not supported - exiting"
    exit 0
fi

self_dir=$(dirname "$(readlink -f "$0")")
cidir="${self_dir}/../../../tests"

source "${cidir}/common.bash"

# Directory containing word lists.
#
# Each file in this directory must:
#
# - Have the ".txt" extension.
# - Contain one word per line.
#
# Additionally, the files may contain blank lines and comments
# (lines beginning with '#').
KATA_DICT_FRAGMENT_DIR=${KATA_DICT_FRAGMENT_DIR:-data}

KATA_DICT_NAME="${KATA_DICT_NAME:-kata-dictionary}"

# Name of dictionary file suitable for using with hunspell(1)
# as a personal dictionary.
KATA_DICT_FILE="${KATA_DICT_FILE:-${KATA_DICT_NAME}.dic}"

# Name of the rules file: by default the dictionary file name with
# ".dic" replaced by ".aff".
KATA_RULES_FILE="${KATA_RULES_FILE:-${KATA_DICT_FILE/.dic/.aff}}"

# command to remove code from markdown (inline and blocks)
strip_cmd="${cidir}/kata-doc-to-script.sh"

fragment_dir="${self_dir}/${KATA_DICT_FRAGMENT_DIR}"

# Name of file containing dictionary rules that apply to the
# KATA_DICT_FILE word list.
rules_file_name="rules.aff"

# Command to spell check a file
spell_check_cmd="${KATA_SPELL_CHECK_CMD:-hunspell}"

# Command to convert a markdown file into plain text
md_convert_tool="${KATA_MARKDOWN_CONVERT_TOOL:-pandoc}"

# Directory the generated dictionary and rules files are written to.
KATA_DICT_DIR="${KATA_DICT_DIR:-${self_dir}}"
dict_file="${KATA_DICT_DIR}/${KATA_DICT_FILE}"
rules_file="${KATA_DICT_DIR}/${KATA_RULES_FILE}"

# Hunspell refers to custom dictionaries by their path followed by the
# name of the dictionary (without the file extension).
kata_dict_ref="${KATA_DICT_DIR}/${KATA_DICT_NAME}"

# All project documentation must be written in English,
# with American English taking priority.
#
# We also use a custom dictionary which has to be specified by its
# "directory and name prefix" and which must also be the first specified
# dictionary.
dict_languages="${kata_dict_ref},en_US,en_GB"

# Build the Hunspell personal dictionary from the word-list fragments.
#
# Globals read:
#   fragment_dir    - directory holding the "*.txt" word lists and the
#                     rules file.
#   rules_file_name - name of the rules file inside fragment_dir.
#   dict_file       - path of the dictionary file to generate.
#   rules_file      - path the rules file is copied to.
#
# Calls die (expected to be provided by the sourced common.bash) if the
# fragment directory is missing or no words are found.
make_dictionary()
{
    [ -d "$fragment_dir" ] || die "invalid fragment directory"
    [ -z "$dict_file" ] && die "missing dictionary output file name"

    # Note: the first field is extracted to allow for inline
    # comments in each fragment. For example:
    #
    #   word    # this text describes why the word is in the dictionary.
    #
    # Lines that are empty, contain only whitespace, or whose first
    # field starts with '#' are skipped, so they can never end up as a
    # bogus (empty or "#") dictionary entry or inflate the word count.
    local dict

    dict=$(cat "$fragment_dir"/*.txt |\
        awk 'NF && $1 !~ /^#/ {print $1}' |\
        sort -u || true)

    [ -z "$dict" ] && die "generated dictionary is empty"

    # Now, add in the number of words as a header (required by Hunspell)
    local count

    count=$(echo "$dict" | wc -l | awk '{print $1}' || true)
    [ -z "$count" ] && die "cannot determine dictionary length"
    [ "$count" -eq 0 ] && die "invalid dictionary length"

    # Construct the dictionary
    (echo "$count"; echo "$dict") > "$dict_file"

    cp "${fragment_dir}/${rules_file_name}" "${rules_file}"
}

# Spell check a single markdown file.
#
# Arguments:
#   $1 - path of the file to check.
#
# Globals read: dict_file, strip_cmd, md_convert_tool, spell_check_cmd,
# dict_languages.
#
# The dictionary is generated first if dict_file does not exist. Every
# suspect word is reported with warn, and die is called if any were
# found (die, warn and info are expected to come from the sourced
# common.bash).
spell_check_file()
{
    # Default to empty so that a missing argument gives the intended
    # error below rather than an "unbound variable" abort under nounset.
    local file="${1:-}"

    [ -z "$file" ] && die "need file to check"
    [ -e "$file" ] || die "file does not exist: '$file'"

    [ -e "$dict_file" ] || make_dictionary

    info "Spell checking file '$file'"

    # Determine the pandoc input format.
    local pandoc_input_fmts
    local pandoc_input_fmt

    # Ask the configured conversion tool (not a hard-coded "pandoc") so
    # the probe matches the binary that performs the conversion below.
    pandoc_input_fmts=$("$md_convert_tool" --list-input-formats 2>/dev/null || true)

    if [ -z "$pandoc_input_fmts" ]
    then
        # We're using a very old version of pandoc that doesn't
        # support listing its available input formats, so
        # specify a default.
        pandoc_input_fmt="markdown_github"
    else
        # Pandoc has multiple names for the gfm parser so find one of them
        pandoc_input_fmt=$(echo "$pandoc_input_fmts" |\
            grep -E "gfm|github" |\
            head -1 || true)
    fi

    [ -z "$pandoc_input_fmt" ] && die "cannot find usable pandoc input format"

    local stripped_doc

    local pandoc_doc
    local utf8_free_doc
    local pre_hunspell_doc
    local hunspell_results
    local final_results

    # First strip out all code blocks and convert all
    # "quoted apostrophes" ('\'') back into a single apostrophe.
    stripped_doc=$("$strip_cmd" -i "$file" -)

    # Next, convert the remainder into plain text to remove the
    # remaining markdown syntax.
    #
    # Before pandoc gets hold of it:
    #
    # - Replace pipes with spaces. This
    #   fixes an issue with old versions of pandoc (Ubuntu 16.04)
    #   which completely mangle tables into nonsense.
    #
    # - Remove empty reference links.
    #
    #   For example, this markdown
    #
    #       blah [`qemu-lite`][qemu-lite] blah.
    #         :
    #       [qemu-lite]: https://...
    #
    #   Gets converted into
    #
    #       blah [][qemu-lite] blah.
    #         :
    #       [qemu-lite]: https://...
    #
    #   And the empty set of square brackets confuses pandoc.
    #
    # After pandoc has processed the data, remove any remaining
    # "inline links" in this format:
    #
    #     [link name](#link-address)
    #
    # This is strictly only required for old versions of pandoc.

    pandoc_doc=$(echo "$stripped_doc" |\
        tr '|' ' ' |\
        sed 's/\[\]\[[^]]*\]//g' |\
        "$md_convert_tool" -f "${pandoc_input_fmt}" -t plain - |\
        sed 's/\[[^]]*\]([^\)]*)//g' || true)

    # Convert the file into "pure ASCII" by removing all awkward
    # Unicode characters that won't spell check.
    #
    # Necessary since pandoc is "clever" and will convert things like
    # GitHub's colon emojis (such as ":smile:") into the actual utf8
    # character where possible.
    utf8_free_doc=$(echo "$pandoc_doc" | iconv -c -f utf-8 -t ascii)

    # Next, perform the following simplifications:
    #
    # - Remove URLs.
    # - Remove email addresses.
    # - Replace most punctuation symbols with a space
    #   (excluding a dash (aka hyphen!)
    # - Carefully remove non-hyphen dashes.
    # - Remove GitHub @userids.
    pre_hunspell_doc=$(echo "$utf8_free_doc" |\
        sed 's,https*://[^[:space:]()][^[:space:]()]*,,g' |\
        sed -r 's/[a-zA-Z0-9.-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+//g' |\
        tr '[,\[\]()\*\\/\|=]' ' ' |\
        sed -e 's/^ *-//g' -e 's/- $//g' -e 's/ -//g' |\
        sed 's/@[a-zA-Z0-9][a-zA-Z0-9]*\b//g')

    # Call the spell checker
    #
    # Note: $spell_check_cmd is expanded unquoted, exactly as before.
    hunspell_results=$(echo "$pre_hunspell_doc" | $spell_check_cmd -d "${dict_languages}")

    # Finally, post-process the hunspell output:
    #
    # - Parse the output to ignore:
    #   - Hunspell banner.
    #   - Correctly spelt words (lines starting with '*', '+' or '-').
    #   - All words containing numbers (like "100MB").
    #   - All words that appear to be acronyms / abbreviations
    #     (at least two upper-case letters and which may be plural or
    #     possessive).
    #   - All words that appear to be numbers.
    #   - All possessives and the dreaded isolated "'s" which occurs
    #     for input like this:
    #
    #         `kata-shim`'s
    #
    #     which gets converted by $strip_cmd into simply:
    #
    #         's
    #
    # - Sort output.

    final_results=$(echo "$hunspell_results" |\
        grep -Evi "(ispell|hunspell)" |\
        grep -Ev '^(\*|\+|-)' |\
        grep -Evi "^(&|#) [^ ]*[0-9][^ ]*" |\
        grep -Ev "^. [A-Z][A-Z][A-Z]*(s|'s)*" |\
        grep -Ev "^. 's" |\
        sort -u || true)

    local line
    local incorrects
    local near_misses

    # '&' lines are misspellings for which suggestions are listed after
    # a colon; for '#' lines only the word itself (field 2) is kept.
    near_misses=$(echo "$final_results" | grep '^&' || true)
    incorrects=$(echo "$final_results" | grep '^\#' | awk '{print $2}' || true)

    local -i failed=0

    [ -n "$near_misses" ] && failed+=1
    [ -n "$incorrects" ] && failed+=1

    local word
    local possibles

    while read -r line
    do
        [ "$line" = "" ] && continue

        word=$(echo "$line" | awk '{print $2}')
        possibles=$(echo "$line" | cut -d: -f2- | sed 's/^ *//g')

        warn "Word '${word}': did you mean one of the following?: ${possibles}"
    done <<< "$near_misses"

    # Word splitting is intentional: one misspelt word per line.
    local incorrect
    for incorrect in $incorrects
    do
        warn "Incorrect word: '$incorrect'"
    done

    [ "$failed" -gt 0 ] && die "Spell check failed for file: '$file'"

    info "Spell check successful for file: '$file'"
}

# Delete the generated dictionary and rules files.
#
# Removes the same paths make_dictionary writes to (dict_file and
# rules_file, both below KATA_DICT_DIR) rather than the bare file names,
# which would only match when the script is run from that directory.
# Missing files are not an error.
delete_dictionary()
{
    rm -f -- "${dict_file}" "${rules_file}"
}

# Check that the external tools this script relies on are available.
#
# Globals read: spell_check_cmd, md_convert_tool.
#
# Calls die for the first of the two commands that cannot be found.
setup()
{
    local cmd

    for cmd in "$spell_check_cmd" "$md_convert_tool"
    do
        command -v "$cmd" &>/dev/null || die "Need $cmd command"
    done
}

# Print the help text to standard output.
#
# Globals read: script_name.
#
# printf is used instead of a tab-indented here-document so the output
# does not depend on how this function is indented.
usage()
{
    printf '%s\n' \
        "Usage: ${script_name} <command> [arguments]" \
        "" \
        "Description: Spell-checking utility." \
        "" \
        "Commands:" \
        "" \
        "check <file> : Spell check the specified file" \
        "(implies 'make-dict')." \
        "delete-dict : Delete the dictionary." \
        "help : Show this usage." \
        "make-dict : Create the dictionary."
}

# Entry point: dispatch on the command given as the first argument.
#
# Arguments:
#   $1 - command: check, delete-dict, help (or -h / --help), make-dict.
#   $2 - file to check (only used by the "check" command).
main()
{
    local cmd="${1:-}"

    # Showing the usage does not need the spell checker or the markdown
    # converter, so handle it before the tool checks in setup.
    case "$cmd" in
        help|-h|--help) usage && exit 0 ;;
    esac

    setup

    if [ -z "$cmd" ]
    then
        usage
        echo
        die "need command"
    fi

    case "$cmd" in
        # Pass an empty string when no file was given so that
        # spell_check_file reports the problem itself; referencing the
        # missing positional parameter directly would abort with
        # "unbound variable" under nounset.
        check) spell_check_file "${2:-}" ;;
        delete-dict) delete_dictionary ;;
        make-dict) make_dictionary ;;
        *) die "invalid command: '$cmd'" ;;
    esac
}

# Run the script, forwarding all command-line arguments unchanged.
main "$@"