kata-containers/tests/cmd/check-spelling/kata-spell-check.sh
Chelsea Mafrica 63636b869c static-checks: Update copyright dates
Some copyright dates were not updated with the most recent changes to
code; update them.

Fixes #8595

Signed-off-by: Chelsea Mafrica <chelsea.e.mafrica@intel.com>
2023-12-12 16:34:06 -08:00

337 lines
8.6 KiB
Bash
Executable File

#!/bin/bash
# Copyright (c) 2019-2023 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
# Description: spell-check utility.
[ -n "$DEBUG" ] && set -x
set -o errexit
set -o pipefail
set -o nounset
# Ensure we spell check in English
LANG=C
LC_ALL=C
script_name=${0##*/}
if [ "$(uname -s)" == "Darwin" ]
then
# Hunspell dictionaries are a not easily available
# on this platform it seems.
echo "INFO: $script_name: OSX not supported - exiting"
exit 0
fi
self_dir=$(dirname "$(readlink -f "$0")")
cidir="${self_dir}/../../../tests"
source "${cidir}/common.bash"
# Directory containing word lists.
#
# Each file in this directory must:
#
# - Have the ".txt" extension.
# - Contain one word per line.
#
# Additionally, the files may contain blank lines and comments
# (lines beginning with '#').
KATA_DICT_FRAGMENT_DIR=${KATA_DICT_FRAGMENT_DIR:-data}
KATA_DICT_NAME="${KATA_DICT_NAME:-kata-dictionary}"
# Name of dictionary file suitable for using with hunspell(1)
# as a personal dictionary.
KATA_DICT_FILE="${KATA_DICT_FILE:-${KATA_DICT_NAME}.dic}"
KATA_RULES_FILE="${KATA_RULES_FILE:-${KATA_DICT_FILE/.dic/.aff}}"
# command to remove code from markdown (inline and blocks)
strip_cmd="${cidir}/kata-doc-to-script.sh"
fragment_dir="${self_dir}/${KATA_DICT_FRAGMENT_DIR}"
# Name of file containing dictionary rules that apply to the
# KATA_DICT_FILE word list.
rules_file_name="rules.aff"
# Command to spell check a file
spell_check_cmd="${KATA_SPELL_CHECK_CMD:-hunspell}"
# Command to convert a markdown file into plain text
md_convert_tool="${KATA_MARKDOWN_CONVERT_TOOL:-pandoc}"
KATA_DICT_DIR="${KATA_DICT_DIR:-${self_dir}}"
dict_file="${KATA_DICT_DIR}/${KATA_DICT_FILE}"
rules_file="${KATA_DICT_DIR}/${KATA_RULES_FILE}"
# Hunspell refers to custom dictionary by their path followed by the name of
# the dictionary (without the file extension).
kata_dict_ref="${KATA_DICT_DIR}/${KATA_DICT_NAME}"
# All project documentation must be written in English,
# with American English taking priority.
#
# We also use a custom dictionary which has to be specified by its
# "directory and name prefix" and which must also be the first specified
# dictionary.
dict_languages="${kata_dict_ref},en_US,en_GB"
make_dictionary()
{
[ -d "$fragment_dir" ] || die "invalid fragment directory"
[ -z "$dict_file" ] && die "missing dictionary output file name"
# Note: the first field is extracted to allow for inline
# comments in each fragment. For example:
#
# word # this text describes why the word is in the dictionary.
#
local dict
dict=$(cat "$fragment_dir"/*.txt |\
grep -v '^\#' |\
grep -v '^$' |\
awk '{print $1}' |\
sort -u || true)
[ -z "$dict" ] && die "generated dictionary is empty"
# Now, add in the number of words as a header (required by Hunspell)
local count
count=$(echo "$dict"| wc -l | awk '{print $1}' || true)
[ -z "$count" ] && die "cannot determine dictionary length"
[ "$count" -eq 0 ] && die "invalid dictionary length"
# Construct the dictionary
(echo "$count"; echo "$dict") > "$dict_file"
cp "${fragment_dir}/${rules_file_name}" "${rules_file}"
}
spell_check_file()
{
local file="$1"
[ -z "$file" ] && die "need file to check"
[ -e "$file" ] || die "file does not exist: '$file'"
[ -e "$dict_file" ] || make_dictionary
info "Spell checking file '$file'"
# Determine the pandoc input format.
local pandoc_input_fmts
local pandoc_input_fmt
local pandoc_input_fmts=$(pandoc --list-input-formats 2>/dev/null || true)
if [ -z "$pandoc_input_fmts" ]
then
# We're using a very old version of pandoc that doesn't
# support listing its available input formats, so
# specify a default.
pandoc_input_fmt="markdown_github"
else
# Pandoc has multiple names for the gfm parser so find one of them
pandoc_input_fmt=$(echo "$pandoc_input_fmts" |\
grep -E "gfm|github" |\
head -1 || true)
fi
[ -z "$pandoc_input_fmt" ] && die "cannot find usable pandoc input format"
local stripped_doc
local pandoc_doc
local utf8_free_doc
local pre_hunspell_doc
local hunspell_results
local final_results
# First strip out all code blocks and convert all
# "quoted apostrophe's" ('\'') back into a single apostrophe.
stripped_doc=$("$strip_cmd" -i "$file" -)
# Next, convert the remainder it into plain text to remove the
# remaining markdown syntax.
#
# Before pandoc gets hold of it:
#
# - Replace pipes with spaces. This
# fixes an issue with old versions of pandoc (Ubuntu 16.04)
# which completely mangle tables into nonsense.
#
# - Remove empty reference links.
#
# For example, this markdown
#
# blah [`qemu-lite`][qemu-lite] blah.
# :
# [qemu-lite]: https://...
#
# Gets converted into
#
# blah [][qemu-lite] blah.
# :
# [qemu-lite]: https://...
#
# And the empty set of square brackets confuses pandoc.
#
# After pandoc has processed the data, remove any remaining
# "inline links" in this format:
#
# [link name](#link-address)
#
# This is strictly only required for old versions of pandoc.
pandoc_doc=$(echo "$stripped_doc" |\
tr '|' ' ' |\
sed 's/\[\]\[[^]]*\]//g' |\
"$md_convert_tool" -f "${pandoc_input_fmt}" -t plain - |\
sed 's/\[[^]]*\]([^\)]*)//g' || true)
# Convert the file into "pure ASCII" by removing all awkward
# Unicode characters that won't spell check.
#
# Necessary since pandoc is "clever" and will convert things like
# GitHub's colon emojis (such as ":smile:") into the actual utf8
# character where possible.
utf8_free_doc=$(echo "$pandoc_doc" | iconv -c -f utf-8 -t ascii)
# Next, perform the following simplifications:
#
# - Remove URLs.
# - Remove email addresses.
# - Replace most punctuation symbols with a space
# (excluding a dash (aka hyphen!)
# - Carefully remove non-hyphen dashes.
# - Remove GitHub @userids.
pre_hunspell_doc=$(echo "$utf8_free_doc" |\
sed 's,https*://[^[:space:]()][^[:space:]()]*,,g' |\
sed -r 's/[a-zA-Z0-9.-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+//g' |\
tr '[,\[\]()\*\\/\|=]' ' ' |\
sed -e 's/^ *-//g' -e 's/- $//g' -e 's/ -//g' |\
sed 's/@[a-zA-Z0-9][a-zA-Z0-9]*\b//g')
# Call the spell checker
hunspell_results=$(echo "$pre_hunspell_doc" | $spell_check_cmd -d "${dict_languages}")
# Finally, post-process the hunspell output:
#
# - Parse the output to ignore:
# - Hunspell banner.
# - Correctly spelt words (lines starting with '*', '+' or '-').
# - All words containing numbers (like "100MB").
# - All words that appear to be acronymns / Abbreviations
# (atleast two upper-case letters and which may be plural or
# possessive).
# - All words that appear to be numbers.
# - All possessives and the dreaded isolated "'s" which occurs
# for input like this:
#
# `kata-shim`'s
#
# which gets converted by $strip_cmd into simply:
#
# 's
#
# - Sort output.
final_results=$(echo "$hunspell_results" |\
grep -Evi "(ispell|hunspell)" |\
grep -Ev '^(\*|\+|-)' |\
grep -Evi "^(&|#) [^ ]*[0-9][^ ]*" |\
grep -Ev "^. [A-Z][A-Z][A-Z]*(s|'s)*" |\
grep -Ev "^. 's" |\
sort -u || true)
local line
local incorrects
local near_misses
near_misses=$(echo "$final_results" | grep '^&' || true)
incorrects=$(echo "$final_results" | grep '^\#' | awk '{print $2}' || true)
local -i failed=0
[ -n "$near_misses" ] && failed+=1
[ -n "$incorrects" ] && failed+=1
echo "$near_misses" | while read -r line
do
[ "$line" = "" ] && continue
local word
local possibles
word=$(echo "$line" | awk '{print $2}')
possibles=$(echo "$line" | cut -d: -f2- | sed 's/^ *//g')
warn "Word '${word}': did you mean one of the following?: ${possibles}"
done
local incorrect
for incorrect in $incorrects
do
warn "Incorrect word: '$incorrect'"
done
[ "$failed" -gt 0 ] && die "Spell check failed for file: '$file'"
info "Spell check successful for file: '$file'"
}
delete_dictionary()
{
rm -f "${KATA_DICT_FILE}" "${KATA_RULES_FILE}"
}
setup()
{
local cmd
for cmd in "$spell_check_cmd" "$md_convert_tool"
do
command -v "$cmd" &>/dev/null || die "Need $cmd command"
done
}
usage()
{
cat <<-EOF
Usage: ${script_name} <command> [arguments]
Description: Spell-checking utility.
Commands:
check <file> : Spell check the specified file
(implies 'make-dict').
delete-dict : Delete the dictionary.
help : Show this usage.
make-dict : Create the dictionary.
EOF
}
main()
{
setup
[ -z "${1:-}" ] && usage && echo && die "need command"
case "$1" in
check) shift && spell_check_file "$1" ;;
delete-dict) delete_dictionary ;;
help|-h|--help) usage && exit 0 ;;
make-dict) make_dictionary ;;
*) die "invalid command: '$1'" ;;
esac
}
main "$@"