From 7f3c12f1ddde070a664b385e9843ecdd8e581dda Mon Sep 17 00:00:00 2001 From: Chelsea Mafrica Date: Tue, 21 Nov 2023 17:47:16 -0800 Subject: [PATCH] tests: move spell check tool to main repo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move tool as part of static checks migration. Fixes #8187 Signed-off-by: Bo Chen Signed-off-by: Carlos Venegas Signed-off-by: Chao Wu Signed-off-by: Chelsea Mafrica Signed-off-by: Dan Middleton Signed-off-by: Derek Lee Signed-off-by: Eric Ernst Signed-off-by: Fabiano Fidêncio Signed-off-by: Gabriela Cervantes Signed-off-by: Graham Whaley Signed-off-by: Hui Zhu Signed-off-by: James O. D. Hunt Signed-off-by: Jimmy Xu Signed-off-by: Liu Xiaodong Signed-off-by: Mikko Ylinen Signed-off-by: Shiming Zhang Signed-off-by: Snir Sheriber Signed-off-by: Wainer dos Santos Moschetta --- tests/cmd/check-spelling/README.md | 178 +++++++++ tests/cmd/check-spelling/data/acronyms.txt | 123 ++++++ tests/cmd/check-spelling/data/arches.txt | 21 + tests/cmd/check-spelling/data/distros.txt | 18 + tests/cmd/check-spelling/data/files.txt | 25 ++ tests/cmd/check-spelling/data/hunspell.txt | 13 + tests/cmd/check-spelling/data/main.txt | 135 +++++++ tests/cmd/check-spelling/data/projects.txt | 101 +++++ tests/cmd/check-spelling/data/rules.aff | 36 ++ tests/cmd/check-spelling/kata-dictionary.aff | 36 ++ tests/cmd/check-spelling/kata-dictionary.dic | 384 +++++++++++++++++++ tests/cmd/check-spelling/kata-spell-check.sh | 336 ++++++++++++++++ 12 files changed, 1406 insertions(+) create mode 100644 tests/cmd/check-spelling/README.md create mode 100644 tests/cmd/check-spelling/data/acronyms.txt create mode 100644 tests/cmd/check-spelling/data/arches.txt create mode 100644 tests/cmd/check-spelling/data/distros.txt create mode 100644 tests/cmd/check-spelling/data/files.txt create mode 100644 tests/cmd/check-spelling/data/hunspell.txt create mode 100644 tests/cmd/check-spelling/data/main.txt create mode 100644 
tests/cmd/check-spelling/data/projects.txt create mode 100644 tests/cmd/check-spelling/data/rules.aff create mode 100644 tests/cmd/check-spelling/kata-dictionary.aff create mode 100644 tests/cmd/check-spelling/kata-dictionary.dic create mode 100755 tests/cmd/check-spelling/kata-spell-check.sh diff --git a/tests/cmd/check-spelling/README.md b/tests/cmd/check-spelling/README.md new file mode 100644 index 0000000000..b7b114f752 --- /dev/null +++ b/tests/cmd/check-spelling/README.md @@ -0,0 +1,178 @@ +# Spell check tool + +## Overview + +The `kata-spell-check.sh` tool is used to check a markdown file for +typographical (spelling) mistakes. + +## Approach + +The spell check tool is based on +[`hunspell`](https://github.com/hunspell/hunspell). It uses standard Hunspell +English dictionaries and supplements these with a custom Hunspell dictionary. +The document is cleaned of several entities before the spell-check begins. +These entities include the following: + +- URLs +- Email addresses +- Code blocks +- Most punctuation +- GitHub userids + +## Custom words + +A custom dictionary is required to accept specific words that are either well +understood by the community or are defined in various document files, but do +not appear in standard dictionaries. The custom dictionaries allow those words +to be accepted as correct. The following lists common examples of such words: + +- Abbreviations +- Acronyms +- Company names +- Product names +- Project names +- Technical terms + +## Spell check a document file + +```sh +$ ./kata-spell-check.sh check /path/to/file +``` + +> **Note:** If you have made local edits to the dictionaries, you may +> [re-create the master dictionary files](#create-the-master-dictionary-files) +> as documented in the [Adding a new word](#adding-a-new-word) section, +> in order for your local edits to take effect. 
+ +## Other options + +Lists all available options and commands: + +```sh +$ ./kata-spell-check.sh -h +``` + +## Technical details + +### Hunspell dictionary format + +A Hunspell dictionary comprises two text files: + +- A word list file + + This file defines a list of words (one per line). The list includes optional + references to one or more rules defined in the rules file as well as optional + comments. Specify fixed words (e.g. company names) verbatim. Enter "normal" + words in their root form. + + The root form of a "normal" word is the simplest and shortest form of that + word. For example, the following words are all formed from the root + word "computer": + + - Computers + - Computer’s + - Computing + - Computed + + Each word in the previous list is an example of using the word "computer" to + construct said word through a combination of applying the following + manipulations: + + - Remove one or more characters from the end of the word. + - Add a new ending. + + Therefore, you list the root word "computer" in the word list file. + +- A rules file + + This file defines named manipulations to apply to root words to form new + words. For example, rules that make a root word plural. + +### Source files + +The rules file and the word list file for the custom dictionary are generated +from "source" fragment files in the [`data`](data/) directory. + +All the fragment files allow comments using the hash (`#`) comment +symbol and all files contain a comment header explaining their content. + +#### Word list file fragments + +The `*.txt` files are word list file fragments. Splitting the word list +into fragments makes updates easier and clearer as each fragment is a +grouping of related terms. The name of the file gives a clue as to the +contents but the comments at the top of each file provide further +detail. + +Every line that does not start with a comment symbol contains a single +word. 
An optional comment for a word may appear after the word and is +separated from the word by whitespace followed by the comment symbol: + +``` +word # This is a comment explaining this particular word list entry. +``` + +You *may* suffix each word by a forward slash followed by one or more +upper-case letters. Each letter refers to a rule name in the rules file: + +``` +word/AC # This word references the 'A' and 'C' rules. +``` + +#### Rules file + +The [rules file](data/rules.aff) contains a set of general rules that can be +applied to one or more root words in the word list files. You can make +comments in the rules file. + +For an explanation of the format of this file see +[`man 5 hunspell`](http://www.manpagez.com/man/5/hunspell) +([source](https://github.com/hunspell/hunspell/blob/master/man/hunspell.5)). + +## Adding a new word + +### Update the word list fragment + +If you want to add a new word to the dictionary: + +- Check to ensure you do need to add the word + + Is the word valid and correct? If the word is a project, product, + or company name, is the capitalization correct? + +- Add the new word to the appropriate [word list fragment file](data). + + Specifically, if it is a general word, add the *root* of the word to + the appropriate fragment file. + +- Add a `/` suffix along with the letters for each rule to apply in order to + add rule references. + +### Optionally update the rules file + +It should not generally be necessary to update the rules file since it +already contains rules for most scenarios. However, if you need to +update the file, [read the documentation carefully](#rules-file). + +### Create the master dictionary files + +Every time you change the dictionary files you must recreate the master +dictionary files: + +```sh +$ ./kata-spell-check.sh make-dict +``` + +As a convenience, [checking a file](#spell-check-a-document-file) will +automatically create the master dictionary files if they do not exist. 
+ +### Test the changes + +You must test any changes to the [word list file +fragments](#word-list-file-fragments) or the [rules file](#rules-file) +by doing the following: + +1. Recreate the [master dictionary files](#create-the-master-dictionary-files). + +1. [Run the spell checker](#spell-check-a-document-file) on a file containing the + words you have added to the dictionary. diff --git a/tests/cmd/check-spelling/data/acronyms.txt b/tests/cmd/check-spelling/data/acronyms.txt new file mode 100644 index 0000000000..3be0907cd3 --- /dev/null +++ b/tests/cmd/check-spelling/data/acronyms.txt @@ -0,0 +1,123 @@ +# Copyright (c) 2019 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +# Description: List of acronyms and abbreviations. + +ACPI/AB +ACS/AB +API/AB +AUFS # Another Union FS +AWS/AB +BDF/AB +CFS/AB +CLI/AB +CNI/AB +CNM/AB +CPUID/AB +CRI/AB +CVE/AB +DAX/AB +DinD/B # Docker in Docker +dind/B +DMA/AB +DPDK/AB +FaaS/B # Function as a Service +FS/AB +fs/B # For terms like "virtio-fs" +GCE/AB +GOPATH/AB +GPG/AB +GPU/AB +gRPC/AB +GSC/AB +GVT/AB +IaaS/B # Infrastructure as a Service +IOMMU/AB +IoT/AB # Internet of Things +IOV/AB +JSON/AB +k8s/B +KCSA/AB +KSM/AB +KVM/AB +LTS/AB +MACVTAP/AB +mem/B # For terms like "virtio-mem" +memdisk/B +MDEV/AB +NEMU/AB +NIC/AB +NVDIMM/AB +OCI/AB +OVMF/AB +OverlayFS/B +PaaS/B # Platform as a Service +PCDIMM/AB +PCI/AB +PCIe/AB +PID/AB +pmem/B # persistent memory +PNG/AB +POD/AB +PR/AB +PSS/AB +QA/AB +QAT/AB +QEMU/AB +RBAC/AB +RDMA/AB +RNG/AB +SaaS/B # Software as a Service +SCSI/AB +SDK/AB +seccomp # secure computing mode +SHA/AB +SPDX/AB +SRIOV/AB +SVG/AB +TBD/AB +TOC/AB +TOML/AB +TTY/AB +UI/AB +UTS/AB +UUID/AB +vCPU/AB +VETH/AB +VF/AB +VFIO/AB +VGPU/AB +vhost/AB +VHOST/AB +virtio/AB +VirtIO/AB +Virtio-fs/AB +Virtio-mem/AB +VLAN/AB +VM/AB +VMCache/AB +vmm +VMM/AB +VMX/AB +VPP/AB +VSOCK/AB +VSS/AB +WIP/AB # Work In Progress +WRT/AB # With Respect To +XIP/AB +YAML/AB +irq/AB +mmio/AB +APIC +msg/AB +UDS +dbs # Dragonball 
Sandbox +TDX +tdx +mptable +fdt +gic +msr +cpuid +pio diff --git a/tests/cmd/check-spelling/data/arches.txt b/tests/cmd/check-spelling/data/arches.txt new file mode 100644 index 0000000000..08fa55d850 --- /dev/null +++ b/tests/cmd/check-spelling/data/arches.txt @@ -0,0 +1,21 @@ +# Copyright (c) 2019 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +# Description: List of architectures. + +# Architectures + +aarch64/B +amd64/B +arm64/B +ppc64el/B +ppc64le/B +s390x/B +x86_64/B +x86/B + +# Micro architecture names + +Haswell/B +Ivybridge/B diff --git a/tests/cmd/check-spelling/data/distros.txt b/tests/cmd/check-spelling/data/distros.txt new file mode 100644 index 0000000000..1edca51f8a --- /dev/null +++ b/tests/cmd/check-spelling/data/distros.txt @@ -0,0 +1,18 @@ +# Copyright (c) 2019 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +# Description: List of Linux Distributions. + +CentOS/B +Debian/B +EulerOS/B +Fedora/B +macOS/B +MacOS/B +minikube/B +openSUSE/B +OpenSUSE/B +RHEL/B +SLES/B +Ubuntu/B diff --git a/tests/cmd/check-spelling/data/files.txt b/tests/cmd/check-spelling/data/files.txt new file mode 100644 index 0000000000..5fa4fc1168 --- /dev/null +++ b/tests/cmd/check-spelling/data/files.txt @@ -0,0 +1,25 @@ +# Copyright (c) 2019 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +# Description: Names of commands, files and packages. +# +# Notes: These *should* strictly be placed in backticks but alas this +# doesn't always happen. 
+# +# References: https://github.com/kata-containers/kata-containers/blob/main/docs/Documentation-Requirements.md#files-and-command-names + +cgroup/AB +coredump/A +cpuset/AB +Dockerfile/AB +init/AB +initramfs/AB +initrd/AB +netns/AB +rootfs/AB +stderr/AB +stdin/AB +stdout/AB +syslog/AB +Vagrantfile/B diff --git a/tests/cmd/check-spelling/data/hunspell.txt b/tests/cmd/check-spelling/data/hunspell.txt new file mode 100644 index 0000000000..feae4b539a --- /dev/null +++ b/tests/cmd/check-spelling/data/hunspell.txt @@ -0,0 +1,13 @@ +# Copyright (c) 2019 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +# Description: List of words that are missing from Hunspell dictionaries +# on some platforms. + +committer/AB # Not available on Ubuntu 16.04 or CentOS 7 +plugin/AB # Not available on Ubuntu 16.04 +regexp/AB # Not available on Ubuntu 16.04 +screenshot/AB # Not available on Ubuntu 16.04 or CentOS 7 +tarball/AB # Not available on Ubuntu 16.04 +uninstall # Not available on Ubuntu 16.04 diff --git a/tests/cmd/check-spelling/data/main.txt b/tests/cmd/check-spelling/data/main.txt new file mode 100644 index 0000000000..3fcf4e5076 --- /dev/null +++ b/tests/cmd/check-spelling/data/main.txt @@ -0,0 +1,135 @@ +# Copyright (c) 2019 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +# Description: General word list. 
+ +ack/A +arg # Argument +auditability +backend +backport/ACD +backtick/AB +backtrace +bootloader/AB +centric/B +checkbox/A +chipset/AB +codebase +commandline +config/AB +crypto # Cryptography +cryptoprocessor/AB +DaemonSet/AB +deliverable/AB +dev +devicemapper/B +deploy +dialer +dialog/A +Diffie/B # Diffie–Hellman (cryptography) +distro/AB +emptydir/A +enablement/AB +entrypoint/AB +ethernet +filename/AB +filesystem/AB +freeform +goroutine/AB +hostname/AB +hotplug/ACD +howto/AB +HugePage/AB +hugepage/AB +Hyp +hypercall/A +hypervisor/AB +implementer/A +implementor/A +Infiniband +iodepth/A +ioengine/A +iptables +Itanium/AB +kata +Kat/AB # "Kat Herding Team" :) +keypair/A +lifecycle/A +linter/AB +logfile/A +Longterm +longterm +loopback +memcpy/A +mergeable +metadata +microcontroller/AB +miniOS +mmap/AB +nack/AB +namespace/ABCD +netlink +NVIDIA/A +nvidia/A +onwards +OpenAPI +OS/AB +parallelize/AC +passthrough +patchset/A +pluggable/AB +portmapper/AB +portmapping/A +pre +prefetch/ACD +prestart +programmatically +proxying +Quadro +ramdisk/A +readonly +rebase/ACD +refactor/ACD +remediate +repo/A +runtime/AB +scalability +serverless +signoff/A +stalebot/B +startup +subdirectory/A +swappiness +sysctl/AB +teardown +templating +timestamp/AB +tracability +ttRPC/B +udev/B +uevent/AB +unbootable +uncomment/ACD +unported +unskip/AC +untrusted +untrusting +userid/AB +userspace/B +vendored +vendoring +versioning +vGPU +virtualization +virtualized +webhook/AB +whitespace +workflow/A +Xeon/A +yaml +upcall +Upcall +ioctl/A +struct/A # struct in Rust +Struct/A \ No newline at end of file diff --git a/tests/cmd/check-spelling/data/projects.txt b/tests/cmd/check-spelling/data/projects.txt new file mode 100644 index 0000000000..963de14158 --- /dev/null +++ b/tests/cmd/check-spelling/data/projects.txt @@ -0,0 +1,101 @@ +# Copyright (c) 2019-2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +# Description: Names of projects, companies and services. 
+ +Ansible/B +AppArmor/B +blogbench/B +BusyBox/B +Cassandra/B +ccloudvm/B +codecov/B +containerd/B +cnn/B +cri-o/B +CRI-O/B +DevStack/B +Django/B +Docker/B +dracut/B +Dragonball/B +Facebook/B +fio/B +Fluentd/B +Frakti/B +Git/B +GitHub/B +GoDoc/B +golang/B +Golang/B +Grafana/B +Gramine/B +Huawei/B +Inclavare/B +iPerf/B +IPerf/B +Istio/B +Jaeger/B +Jenkins/B +Jupyter/B +journald/B +jq/B +Kata/B +Kibana/B +Kubelet/B +Kubernetes/B +Launchpad/B +LevelDB/B +libcontainer/B +libelf/B +libvirt/B +Linkerd/B +LinuxONE/B +Logrus/B +Logstash/B +Mellanox/B +Minikube/B +MITRE/B +musl/B +Netlify/B +Nginx/B +OpenCensus/B +OpenPGP/B +OpenShift/B +OpenSSL/B +OpenStack/B +OpenTelemetry/B +OpenTracing/B +osbuilder/B +packagecloud/B +Pandoc/B +Podman/B +PullApprove/B +Pytorch/B +QuickAssist/B +R/B +raytracer/B +rkt/B/B +runc/B +runV/B +rustlang/B +Rustlang/B +SELinux/B +SemaphoreCI/B +snapcraft/B +snapd/B +SQLite/B +SUSE/B +Sysbench/B +systemd/B +tf/B +TravisCI/B +Tokio/B +Vexxhost/B +virtcontainers/B +VMWare/B +vSphere/B +Yamux/B +yq/B +Zun/B diff --git a/tests/cmd/check-spelling/data/rules.aff b/tests/cmd/check-spelling/data/rules.aff new file mode 100644 index 0000000000..7f37dbf477 --- /dev/null +++ b/tests/cmd/check-spelling/data/rules.aff @@ -0,0 +1,36 @@ +# +# Copyright (c) 2019 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# + +SET UTF-8 + +# Add the following characters so they are accepted as part of a word +WORDCHARS 0123456789' + +# Disable hyphenation +BREAK 0 + +# plural +SFX A N 3 +SFX A 0 s [^x] +SFX A 0 es x +SFX A y ies + +# possession +SFX B N 1 +SFX B 0 's + +# past tense +SFX C N 4 +SFX C 0 d e +SFX C 0 ed [rt] +SFX C 0 ped p +SFX C 0 ged g + +# present continuous +SFX D N 3 +SFX D 0 ging g +SFX D 0 ing [rt] +SFX D e ing e diff --git a/tests/cmd/check-spelling/kata-dictionary.aff b/tests/cmd/check-spelling/kata-dictionary.aff new file mode 100644 index 0000000000..7f37dbf477 --- /dev/null +++ b/tests/cmd/check-spelling/kata-dictionary.aff @@ -0,0 
+1,36 @@ +# +# Copyright (c) 2019 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# + +SET UTF-8 + +# Add the following characters so they are accepted as part of a word +WORDCHARS 0123456789' + +# Disable hyphenation +BREAK 0 + +# plural +SFX A N 3 +SFX A 0 s [^x] +SFX A 0 es x +SFX A y ies + +# possession +SFX B N 1 +SFX B 0 's + +# past tense +SFX C N 4 +SFX C 0 d e +SFX C 0 ed [rt] +SFX C 0 ped p +SFX C 0 ged g + +# present continuous +SFX D N 3 +SFX D 0 ging g +SFX D 0 ing [rt] +SFX D e ing e diff --git a/tests/cmd/check-spelling/kata-dictionary.dic b/tests/cmd/check-spelling/kata-dictionary.dic new file mode 100644 index 0000000000..33d41e37ec --- /dev/null +++ b/tests/cmd/check-spelling/kata-dictionary.dic @@ -0,0 +1,384 @@ +383 +ACPI/AB +ACS/AB +API/AB +APIC +AUFS +AWS/AB +Ansible/B +AppArmor/B +BDF/AB +BusyBox/B +CFS/AB +CLI/AB +CNI/AB +CNM/AB +CPUID/AB +CRI-O/B +CRI/AB +CVE/AB +Cassandra/B +CentOS/B +DAX/AB +DMA/AB +DPDK/AB +DaemonSet/AB +Debian/B +DevStack/B +Diffie/B +DinD/B +Django/B +Docker/B +Dockerfile/AB +Dragonball/B +EulerOS/B +FS/AB +FaaS/B +Facebook/B +Fedora/B +Fluentd/B +Frakti/B +GCE/AB +GOPATH/AB +GPG/AB +GPU/AB +GSC/AB +GVT/AB +Git/B +GitHub/B +GoDoc/B +Golang/B +Grafana/B +Gramine/B +Haswell/B +Huawei/B +HugePage/AB +Hyp +IOMMU/AB +IOV/AB +IPerf/B +IaaS/B +Inclavare/B +Infiniband +IoT/AB +Istio/B +Itanium/AB +Ivybridge/B +JSON/AB +Jaeger/B +Jenkins/B +Jupyter/B +KCSA/AB +KSM/AB +KVM/AB +Kat/AB +Kata/B +Kibana/B +Kubelet/B +Kubernetes/B +LTS/AB +Launchpad/B +LevelDB/B +Linkerd/B +LinuxONE/B +Logrus/B +Logstash/B +Longterm +MACVTAP/AB +MDEV/AB +MITRE/B +MacOS/B +Mellanox/B +Minikube/B +NEMU/AB +NIC/AB +NVDIMM/AB +NVIDIA/A +Netlify/B +Nginx/B +OCI/AB +OS/AB +OVMF/AB +OpenAPI +OpenCensus/B +OpenPGP/B +OpenSSL/B +OpenSUSE/B +OpenShift/B +OpenStack/B +OpenTelemetry/B +OpenTracing/B +OverlayFS/B +PCDIMM/AB +PCI/AB +PCIe/AB +PID/AB +PNG/AB +POD/AB +PR/AB +PSS/AB +PaaS/B +Pandoc/B +Podman/B +PullApprove/B +Pytorch/B +QA/AB +QAT/AB 
+QEMU/AB +Quadro +QuickAssist/B +R/B +RBAC/AB +RDMA/AB +RHEL/B +RNG/AB +Rustlang/B +SCSI/AB +SDK/AB +SELinux/B +SHA/AB +SLES/B +SPDX/AB +SQLite/B +SRIOV/AB +SUSE/B +SVG/AB +SaaS/B +SemaphoreCI/B +Struct/A +Sysbench/B +TBD/AB +TDX +TOC/AB +TOML/AB +TTY/AB +Tokio/B +TravisCI/B +UDS +UI/AB +UTS/AB +UUID/AB +Ubuntu/B +Upcall +VETH/AB +VF/AB +VFIO/AB +VGPU/AB +VHOST/AB +VLAN/AB +VM/AB +VMCache/AB +VMM/AB +VMWare/B +VMX/AB +VPP/AB +VSOCK/AB +VSS/AB +Vagrantfile/B +Vexxhost/B +VirtIO/AB +Virtio-fs/AB +Virtio-mem/AB +WIP/AB +WRT/AB +XIP/AB +Xeon/A +YAML/AB +Yamux/B +Zun/B +aarch64/B +ack/A +amd64/B +arg +arm64/B +auditability +backend +backport/ACD +backtick/AB +backtrace +blogbench/B +bootloader/AB +ccloudvm/B +centric/B +cgroup/AB +checkbox/A +chipset/AB +cnn/B +codebase +codecov/B +commandline +committer/AB +config/AB +containerd/B +coredump/A +cpuid +cpuset/AB +cri-o/B +crypto +cryptoprocessor/AB +dbs +deliverable/AB +deploy +dev +devicemapper/B +dialer +dialog/A +dind/B +distro/AB +dracut/B +emptydir/A +enablement/AB +entrypoint/AB +ethernet +fdt +filename/AB +filesystem/AB +fio/B +freeform +fs/B +gRPC/AB +gic +golang/B +goroutine/AB +hostname/AB +hotplug/ACD +howto/AB +hugepage/AB +hypercall/A +hypervisor/AB +iPerf/B +implementer/A +implementor/A +init/AB +initramfs/AB +initrd/AB +ioctl/A +iodepth/A +ioengine/A +iptables +irq/AB +journald/B +jq/B +k8s/B +kata +keypair/A +libcontainer/B +libelf/B +libvirt/B +lifecycle/A +linter/AB +logfile/A +longterm +loopback +macOS/B +mem/B +memcpy/A +memdisk/B +mergeable +metadata +microcontroller/AB +miniOS +minikube/B +mmap/AB +mmio/AB +mptable +msg/AB +msr +musl/B +nack/AB +namespace/ABCD +netlink +netns/AB +nvidia/A +onwards +openSUSE/B +osbuilder/B +packagecloud/B +parallelize/AC +passthrough +patchset/A +pio +pluggable/AB +plugin/AB +pmem/B +portmapper/AB +portmapping/A +ppc64el/B +ppc64le/B +pre +prefetch/ACD +prestart +programmatically +proxying +ramdisk/A +raytracer/B +readonly +rebase/ACD +refactor/ACD +regexp/AB 
+remediate +repo/A +rkt/B/B +rootfs/AB +runV/B +runc/B +runtime/AB +rustlang/B +s390x/B +scalability +screenshot/AB +seccomp +serverless +signoff/A +snapcraft/B +snapd/B +stalebot/B +startup +stderr/AB +stdin/AB +stdout/AB +struct/A +subdirectory/A +swappiness +sysctl/AB +syslog/AB +systemd/B +tarball/AB +tdx +teardown +templating +tf/B +timestamp/AB +tracability +ttRPC/B +udev/B +uevent/AB +unbootable +uncomment/ACD +uninstall +unported +unskip/AC +untrusted +untrusting +upcall +userid/AB +userspace/B +vCPU/AB +vGPU +vSphere/B +vendored +vendoring +versioning +vhost/AB +virtcontainers/B +virtio/AB +virtualization +virtualized +vmm +webhook/AB +whitespace +workflow/A +x86/B +x86_64/B +yaml +yq/B diff --git a/tests/cmd/check-spelling/kata-spell-check.sh b/tests/cmd/check-spelling/kata-spell-check.sh new file mode 100755 index 0000000000..ca7b2b8f09 --- /dev/null +++ b/tests/cmd/check-spelling/kata-spell-check.sh @@ -0,0 +1,336 @@ +#!/bin/bash +# Copyright (c) 2019 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +# Description: spell-check utility. + +[ -n "$DEBUG" ] && set -x + +set -o errexit +set -o pipefail +set -o nounset + +# Ensure we spell check in English +LANG=C +LC_ALL=C + +script_name=${0##*/} + +if [ "$(uname -s)" == "Darwin" ] +then + # Hunspell dictionaries are not easily available + # on this platform it seems. + echo "INFO: $script_name: OSX not supported - exiting" + exit 0 +fi + +self_dir=$(dirname "$(readlink -f "$0")") +cidir="${self_dir}/../../.ci" + +source "${cidir}/lib.sh" + +# Directory containing word lists. +# +# Each file in this directory must: +# +# - Have the ".txt" extension. +# - Contain one word per line. +# +# Additionally, the files may contain blank lines and comments +# (lines beginning with '#'). +KATA_DICT_FRAGMENT_DIR=${KATA_DICT_FRAGMENT_DIR:-data} + +KATA_DICT_NAME="${KATA_DICT_NAME:-kata-dictionary}" + +# Name of dictionary file suitable for using with hunspell(1) +# as a personal dictionary. 
KATA_DICT_FILE="${KATA_DICT_FILE:-${KATA_DICT_NAME}.dic}"

# Name of the Hunspell rules (affix) file that accompanies KATA_DICT_FILE.
#
# Only a trailing ".dic" is swapped for ".aff", so the derived rules file
# name can never be identical to the word list file name.
KATA_RULES_FILE="${KATA_RULES_FILE:-${KATA_DICT_FILE%.dic}.aff}"

# Command to remove code from markdown (inline and blocks)
strip_cmd="${cidir}/kata-doc-to-script.sh"

# Directory containing the word list fragments and the rules source file.
fragment_dir="${self_dir}/${KATA_DICT_FRAGMENT_DIR}"

# Name of file containing dictionary rules that apply to the
# KATA_DICT_FILE word list.
rules_file_name="rules.aff"

# Command to spell check a file
spell_check_cmd="${KATA_SPELL_CHECK_CMD:-hunspell}"

# Command to convert a markdown file into plain text
md_convert_tool="${KATA_MARKDOWN_CONVERT_TOOL:-pandoc}"

# Directory the generated dictionary files are written to and read from.
KATA_DICT_DIR="${KATA_DICT_DIR:-${self_dir}}"
dict_file="${KATA_DICT_DIR}/${KATA_DICT_FILE}"
rules_file="${KATA_DICT_DIR}/${KATA_RULES_FILE}"

# Hunspell refers to a custom dictionary by its path followed by the name of
# the dictionary (without the file extension).
kata_dict_ref="${KATA_DICT_DIR}/${KATA_DICT_NAME}"

# All project documentation must be written in English,
# with American English taking priority.
#
# We also use a custom dictionary which has to be specified by its
# "directory and name prefix" and which must also be the first specified
# dictionary.
dict_languages="${kata_dict_ref},en_US,en_GB"

# Generate the Hunspell word list ($dict_file) and rules file ($rules_file)
# from the fragment files in $fragment_dir.
#
# Globals read: fragment_dir, rules_file_name, dict_file, rules_file.
# Outputs:      overwrites $dict_file and $rules_file.
# Exits (via die) if the fragment directory is missing, contains no "*.txt"
# fragments, or yields an empty word list.
make_dictionary()
{
    [ -d "$fragment_dir" ] || die "invalid fragment directory"
    [ -n "$dict_file" ] || die "missing dictionary output file name"

    local -a fragments=("$fragment_dir"/*.txt)

    # An unmatched glob is left as the literal pattern, which does not exist.
    [ -e "${fragments[0]}" ] || die "no word list fragments found in '$fragment_dir'"

    # Note: only the first field of each line is kept to allow for inline
    # comments in each fragment. For example:
    #
    #   word # this text describes why the word is in the dictionary.
    #
    # The fragments are passed to awk as separate files (rather than being
    # concatenated first) so that a fragment lacking a final newline cannot
    # have its last word glued to the first line of the next fragment.
    # Blank lines and comment lines are dropped.
    local dict

    dict=$(awk 'NF > 0 && $1 !~ /^#/ { print $1 }' "${fragments[@]}" |
        LC_ALL=C sort -u) || die "cannot read word list fragments"

    [ -n "$dict" ] || die "generated dictionary is empty"

    # Now, add in the number of words as a header (required by Hunspell)
    local count

    count=$(printf '%s\n' "$dict" | grep -c '') ||
        die "cannot determine dictionary length"
    [ "$count" -gt 0 ] || die "invalid dictionary length"

    # Construct the dictionary
    printf '%s\n%s\n' "$count" "$dict" > "$dict_file"

    cp -- "${fragment_dir}/${rules_file_name}" "${rules_file}"
}

# Spell check a single markdown file.
#
# Arguments: $1 - path to the file to check.
# Globals read: dict_file, rules_file, strip_cmd, md_convert_tool,
#               spell_check_cmd, dict_languages.
# Outputs: one warning per misspelt word (via warn).
# Exits (via die) if the file is missing or any misspelling is reported.
spell_check_file()
{
    local file="${1:-}"

    [ -n "$file" ] || die "need file to check"
    [ -e "$file" ] || die "file does not exist: '$file'"

    # Hunspell needs both the word list and the rules file.
    if [ ! -e "$dict_file" ] || [ ! -e "$rules_file" ]
    then
        make_dictionary
    fi

    info "Spell checking file '$file'"

    # Determine the pandoc input format.
    local pandoc_input_fmts
    local pandoc_input_fmt

    pandoc_input_fmts=$("$md_convert_tool" --list-input-formats 2>/dev/null || true)

    if [ -z "$pandoc_input_fmts" ]
    then
        # We're using a very old version of pandoc that doesn't
        # support listing its available input formats, so
        # specify a default.
        pandoc_input_fmt="markdown_github"
    else
        # Pandoc has multiple names for the gfm parser so find one of them
        pandoc_input_fmt=$(echo "$pandoc_input_fmts" |
            grep -E "gfm|github" |
            head -1 || true)
    fi

    [ -n "$pandoc_input_fmt" ] || die "cannot find usable pandoc input format"

    local stripped_doc
    local pandoc_doc
    local utf8_free_doc
    local pre_hunspell_doc
    local hunspell_results
    local final_results

    # First strip out all code blocks and convert all
    # "quoted apostrophe's" ('\'') back into a single apostrophe.
    stripped_doc=$("$strip_cmd" -i "$file" -)

    # Next, convert the remainder into plain text to remove the
    # remaining markdown syntax.
    #
    # Before pandoc gets hold of it:
    #
    # - Replace pipes with spaces. This
    #   fixes an issue with old versions of pandoc (Ubuntu 16.04)
    #   which completely mangle tables into nonsense.
    #
    # - Remove empty reference links.
    #
    #   For example, this markdown
    #
    #     blah [`qemu-lite`][qemu-lite] blah.
    #       :
    #     [qemu-lite]: https://...
    #
    #   Gets converted into
    #
    #     blah [][qemu-lite] blah.
    #       :
    #     [qemu-lite]: https://...
    #
    #   And the empty set of square brackets confuses pandoc.
    #
    # After pandoc has processed the data, remove any remaining
    # "inline links" in this format:
    #
    #   [link name](#link-address)
    #
    # This is strictly only required for old versions of pandoc.
    pandoc_doc=$(echo "$stripped_doc" |
        tr '|' ' ' |
        sed 's/\[\]\[[^]]*\]//g' |
        "$md_convert_tool" -f "${pandoc_input_fmt}" -t plain - |
        sed 's/\[[^]]*\]([^\)]*)//g' || true)

    # Convert the file into "pure ASCII" by removing all awkward
    # Unicode characters that won't spell check.
    #
    # Necessary since pandoc is "clever" and will convert things like
    # GitHub's colon emojis (such as ":smile:") into the actual utf8
    # character where possible.
    utf8_free_doc=$(echo "$pandoc_doc" | iconv -c -f utf-8 -t ascii)

    # Next, perform the following simplifications:
    #
    # - Remove URLs.
    # - Remove email addresses.
    # - Replace most punctuation symbols with a space
    #   (excluding a dash (aka hyphen!)
    # - Carefully remove non-hyphen dashes.
    # - Remove GitHub @userids.
    pre_hunspell_doc=$(echo "$utf8_free_doc" |
        sed 's,https*://[^[:space:]()][^[:space:]()]*,,g' |
        sed -r 's/[a-zA-Z0-9.-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+//g' |
        tr '[,\[\]()\*\\/\|=]' ' ' |
        sed -e 's/^ *-//g' -e 's/- $//g' -e 's/ -//g' |
        sed 's/@[a-zA-Z0-9][a-zA-Z0-9]*\b//g')

    # Call the spell checker
    hunspell_results=$(echo "$pre_hunspell_doc" |
        "$spell_check_cmd" -d "${dict_languages}")

    # Finally, post-process the hunspell output:
    #
    # - Parse the output to ignore:
    #   - Hunspell banner.
    #   - Correctly spelt words (lines starting with '*', '+' or '-').
    #   - All words containing numbers (like "100MB").
    #   - All words that appear to be acronyms / abbreviations
    #     (at least two upper-case letters and which may be plural or
    #     possessive).
    #   - All words that appear to be numbers.
    #   - All possessives and the dreaded isolated "'s" which occurs
    #     for input like this:
    #
    #       `kata-shim`'s
    #
    #     which gets converted by $strip_cmd into simply:
    #
    #       's
    #
    # - Sort output.
    final_results=$(echo "$hunspell_results" |
        grep -Evi "(ispell|hunspell)" |
        grep -Ev '^(\*|\+|-)' |
        grep -Evi "^(&|#) [^ ]*[0-9][^ ]*" |
        grep -Ev "^. [A-Z][A-Z][A-Z]*(s|'s)*" |
        grep -Ev "^. 's" |
        sort -u || true)

    local line
    local incorrects
    local near_misses

    # '&' lines carry suggestions; '#' lines are words with no suggestion.
    near_misses=$(echo "$final_results" | grep '^&' || true)
    incorrects=$(echo "$final_results" | grep '^\#' | awk '{print $2}' || true)

    local -i failed=0

    if [ -n "$near_misses" ]; then failed+=1; fi
    if [ -n "$incorrects" ]; then failed+=1; fi

    local word
    local possibles

    while read -r line
    do
        [ -n "$line" ] || continue

        word=$(echo "$line" | awk '{print $2}')
        possibles=$(echo "$line" | cut -d: -f2- | sed 's/^ *//g')

        warn "Word '${word}': did you mean one of the following?: ${possibles}"
    done <<< "$near_misses"

    local incorrect

    while read -r incorrect
    do
        [ -n "$incorrect" ] || continue

        warn "Incorrect word: '$incorrect'"
    done <<< "$incorrects"

    [ "$failed" -eq 0 ] || die "Spell check failed for file: '$file'"

    info "Spell check successful for file: '$file'"
}

# Remove the generated dictionary files.
#
# Deletes the same paths that make_dictionary writes ($dict_file and
# $rules_file), regardless of the current working directory.
delete_dictionary()
{
    rm -f -- "${dict_file}" "${rules_file}"
}

# Ensure the external tools needed for spell checking are installed.
setup()
{
    local cmd

    for cmd in "$spell_check_cmd" "$md_convert_tool"
    do
        command -v "$cmd" &>/dev/null || die "Need $cmd command"
    done
}

# Print the command-line help to stdout.
usage()
{
    cat <<EOF
Usage: ${script_name} <command> [arguments]

Description: Spell-checking utility.

Commands:

  check <file> : Spell check the specified file
                 (implies 'make-dict').
  delete-dict  : Delete the dictionary.
  help         : Show this usage.
  make-dict    : Create the dictionary.
EOF
}

# Entry point: dispatch on the command given as the first argument.
#
# Only 'check' needs the spell checker and markdown converter, so the tool
# check is limited to that command.
main()
{
    if [ -z "${1:-}" ]
    then
        usage
        echo
        die "need command"
    fi

    case "$1" in
        check)
            shift
            setup
            spell_check_file "${1:-}"
            ;;
        delete-dict) delete_dictionary ;;
        help|-h|--help) usage; exit 0 ;;
        make-dict) make_dictionary ;;
        *) die "invalid command: '$1'" ;;
    esac
}

main "$@"