diff --git a/langchain/evaluation/__init__.py b/langchain/evaluation/__init__.py index 4bceb9782dd..ae4fdb6e124 100644 --- a/langchain/evaluation/__init__.py +++ b/langchain/evaluation/__init__.py @@ -5,20 +5,21 @@ LangChain primitives such as language models and chains. Some common use cases for evaluation include: -- Grading the accuracy of a response against ground truth answers: QAEvalChain -- Comparing the output of two models: PairwiseStringEvalChain -- Judging the efficacy of an agent's tool usage: TrajectoryEvalChain -- Checking whether an output complies with a set of criteria: CriteriaEvalChain +- Grading the accuracy of a response against ground truth answers: :class:`langchain.evaluation.qa.eval_chain.QAEvalChain` +- Comparing the output of two models: :class:`langchain.evaluation.comparison.eval_chain.PairwiseStringEvalChain` +- Judging the efficacy of an agent's tool usage: :class:`langchain.evaluation.agents.trajectory_eval_chain.TrajectoryEvalChain` +- Checking whether an output complies with a set of criteria: :class:`langchain.evaluation.criteria.eval_chain.CriteriaEvalChain` This module also contains low-level APIs for creating custom evaluators for specific evaluation tasks. These include: -- StringEvaluator: Evaluates an output string against a reference and/or input context. -- PairwiseStringEvaluator: Evaluates two strings against each other. + +- :class:`langchain.evaluation.schema.StringEvaluator`: Evaluates an output string against a reference and/or input context. +- :class:`langchain.evaluation.schema.PairwiseStringEvaluator`: Evaluates two strings against each other. For loading evaluators and LangChain's HuggingFace datasets, you can use the -load_evaluators and load_dataset functions, respectively. -""" +:func:`langchain.evaluation.loading.load_evaluators` and :func:`langchain.evaluation.loading.load_dataset` functions, respectively. 
+""" # noqa: E501 from langchain.evaluation.agents.trajectory_eval_chain import TrajectoryEvalChain from langchain.evaluation.comparison import PairwiseStringEvalChain from langchain.evaluation.criteria.eval_chain import CriteriaEvalChain diff --git a/tests/unit_tests/evaluation/test_loading.py b/tests/unit_tests/evaluation/test_loading.py new file mode 100644 index 00000000000..27c538d8b3b --- /dev/null +++ b/tests/unit_tests/evaluation/test_loading.py @@ -0,0 +1,16 @@ +"""Test the loading function for evaluators.""" + +import pytest + +from langchain.evaluation.loading import EvaluatorType, load_evaluators +from tests.unit_tests.llms.fake_chat_model import FakeChatModel + + +@pytest.mark.parametrize("evaluator_type", EvaluatorType) +def test_load_evaluators(evaluator_type: EvaluatorType) -> None: +    """Test loading evaluators.""" +    fake_llm = FakeChatModel() +    load_evaluators([evaluator_type], llm=fake_llm) + +    # Test as string +    load_evaluators([evaluator_type.value], llm=fake_llm)  # type: ignore