mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-03 20:16:52 +00:00
Fix make docs_build
and related scripts (#7276)
**Description: a description of the change** Fixed `make docs_build` and related scripts which caused errors. There are several changes. First, I made the build of the documentation and the API Reference into two separate commands. This is because it takes less time to build. The commands for documents are `make docs_build`, `make docs_clean`, and `make docs_linkcheck`. The commands for API Reference are `make api_docs_build`, `api_docs_clean`, and `api_docs_linkcheck`. It looked like `docs/.local_build.sh` could be used to build the documentation, so I used that. Since `.local_build.sh` was also building API Rerefence internally, I removed that process. `.local_build.sh` also added some Bash options to stop in error or so. Futher more added `cd "${SCRIPT_DIR}"` at the beginning so that the script will work no matter which directory it is executed in. `docs/api_reference/api_reference.rst` is removed, because which is generated by `docs/api_reference/create_api_rst.py`, and added it to .gitignore. Finally, the description of CONTRIBUTING.md was modified. **Issue: the issue # it fixes (if applicable)** https://github.com/hwchase17/langchain/issues/6413 **Dependencies: any dependencies required for this change** `nbdoc` was missing in group docs so it was added. I installed it with the `poetry add --group docs nbdoc` command. I am concerned if any modifications are needed to poetry.lock. I would greatly appreciate it if you could pay close attention to this file during the review. **Tag maintainer** - General / Misc / if you don't know who to tag: @baskaryan If this PR needs any additional changes, I'll be happy to make them! --------- Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
@@ -117,11 +117,11 @@
|
||||
"\n",
|
||||
"\n",
|
||||
"# Initialize the language model\n",
|
||||
"# You can add your own OpenAI API key by adding openai_api_key=\"<your_api_key>\" \n",
|
||||
"# You can add your own OpenAI API key by adding openai_api_key=\"<your_api_key>\"\n",
|
||||
"llm = ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-0613\")\n",
|
||||
"\n",
|
||||
"# Initialize the SerpAPIWrapper for search functionality\n",
|
||||
"#Replace <your_api_key> in openai_api_key=\"<your_api_key>\" with your actual SerpAPI key.\n",
|
||||
"# Replace <your_api_key> in openai_api_key=\"<your_api_key>\" with your actual SerpAPI key.\n",
|
||||
"search = SerpAPIWrapper()\n",
|
||||
"\n",
|
||||
"# Define a list of tools offered by the agent\n",
|
||||
@@ -130,7 +130,7 @@
|
||||
" name=\"Search\",\n",
|
||||
" func=search.run,\n",
|
||||
" coroutine=search.arun,\n",
|
||||
" description=\"Useful when you need to answer questions about current events. You should ask targeted questions.\"\n",
|
||||
" description=\"Useful when you need to answer questions about current events. You should ask targeted questions.\",\n",
|
||||
" ),\n",
|
||||
"]"
|
||||
]
|
||||
@@ -143,8 +143,12 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"functions_agent = initialize_agent(tools, llm, agent=AgentType.OPENAI_MULTI_FUNCTIONS, verbose=False)\n",
|
||||
"conversations_agent = initialize_agent(tools, llm, agent=AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION, verbose=False)"
|
||||
"functions_agent = initialize_agent(\n",
|
||||
" tools, llm, agent=AgentType.OPENAI_MULTI_FUNCTIONS, verbose=False\n",
|
||||
")\n",
|
||||
"conversations_agent = initialize_agent(\n",
|
||||
" tools, llm, agent=AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION, verbose=False\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -193,20 +197,20 @@
|
||||
"\n",
|
||||
"results = []\n",
|
||||
"agents = [functions_agent, conversations_agent]\n",
|
||||
"concurrency_level = 6 # How many concurrent agents to run. May need to decrease if OpenAI is rate limiting.\n",
|
||||
"concurrency_level = 6 # How many concurrent agents to run. May need to decrease if OpenAI is rate limiting.\n",
|
||||
"\n",
|
||||
"# We will only run the first 20 examples of this dataset to speed things up\n",
|
||||
"# This will lead to larger confidence intervals downstream.\n",
|
||||
"batch = []\n",
|
||||
"for example in tqdm(dataset[:20]):\n",
|
||||
" batch.extend([agent.acall(example['inputs']) for agent in agents])\n",
|
||||
" batch.extend([agent.acall(example[\"inputs\"]) for agent in agents])\n",
|
||||
" if len(batch) >= concurrency_level:\n",
|
||||
" batch_results = await asyncio.gather(*batch, return_exceptions=True)\n",
|
||||
" results.extend(list(zip(*[iter(batch_results)]*2)))\n",
|
||||
" results.extend(list(zip(*[iter(batch_results)] * 2)))\n",
|
||||
" batch = []\n",
|
||||
"if batch:\n",
|
||||
" batch_results = await asyncio.gather(*batch, return_exceptions=True)\n",
|
||||
" results.extend(list(zip(*[iter(batch_results)]*2)))"
|
||||
" results.extend(list(zip(*[iter(batch_results)] * 2)))"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -230,11 +234,12 @@
|
||||
"source": [
|
||||
"import random\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def predict_preferences(dataset, results) -> list:\n",
|
||||
" preferences = []\n",
|
||||
"\n",
|
||||
" for example, (res_a, res_b) in zip(dataset, results):\n",
|
||||
" input_ = example['inputs']\n",
|
||||
" input_ = example[\"inputs\"]\n",
|
||||
" # Flip a coin to reduce persistent position bias\n",
|
||||
" if random.random() < 0.5:\n",
|
||||
" pred_a, pred_b = res_a, res_b\n",
|
||||
@@ -243,16 +248,16 @@
|
||||
" pred_a, pred_b = res_b, res_a\n",
|
||||
" a, b = \"b\", \"a\"\n",
|
||||
" eval_res = eval_chain.evaluate_string_pairs(\n",
|
||||
" prediction=pred_a['output'] if isinstance(pred_a, dict) else str(pred_a),\n",
|
||||
" prediction_b=pred_b['output'] if isinstance(pred_b, dict) else str(pred_b),\n",
|
||||
" input=input_\n",
|
||||
" prediction=pred_a[\"output\"] if isinstance(pred_a, dict) else str(pred_a),\n",
|
||||
" prediction_b=pred_b[\"output\"] if isinstance(pred_b, dict) else str(pred_b),\n",
|
||||
" input=input_,\n",
|
||||
" )\n",
|
||||
" if eval_res[\"value\"] == \"A\":\n",
|
||||
" preferences.append(a)\n",
|
||||
" elif eval_res[\"value\"] == \"B\":\n",
|
||||
" preferences.append(b)\n",
|
||||
" else:\n",
|
||||
" preferences.append(None) # No preference\n",
|
||||
" preferences.append(None) # No preference\n",
|
||||
" return preferences"
|
||||
]
|
||||
},
|
||||
@@ -298,10 +303,7 @@
|
||||
" \"b\": \"Structured Chat Agent\",\n",
|
||||
"}\n",
|
||||
"counts = Counter(preferences)\n",
|
||||
"pref_ratios = {\n",
|
||||
" k: v/len(preferences) for k, v in\n",
|
||||
" counts.items()\n",
|
||||
"}\n",
|
||||
"pref_ratios = {k: v / len(preferences) for k, v in counts.items()}\n",
|
||||
"for k, v in pref_ratios.items():\n",
|
||||
" print(f\"{name_map.get(k)}: {v:.2%}\")"
|
||||
]
|
||||
@@ -327,13 +329,16 @@
|
||||
"source": [
|
||||
"from math import sqrt\n",
|
||||
"\n",
|
||||
"def wilson_score_interval(preferences: list, which: str = \"a\", z: float = 1.96) -> tuple:\n",
|
||||
"\n",
|
||||
"def wilson_score_interval(\n",
|
||||
" preferences: list, which: str = \"a\", z: float = 1.96\n",
|
||||
") -> tuple:\n",
|
||||
" \"\"\"Estimate the confidence interval using the Wilson score.\n",
|
||||
" \n",
|
||||
"\n",
|
||||
" See: https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval\n",
|
||||
" for more details, including when to use it and when it should not be used.\n",
|
||||
" \"\"\"\n",
|
||||
" total_preferences = preferences.count('a') + preferences.count('b')\n",
|
||||
" total_preferences = preferences.count(\"a\") + preferences.count(\"b\")\n",
|
||||
" n_s = preferences.count(which)\n",
|
||||
"\n",
|
||||
" if total_preferences == 0:\n",
|
||||
@@ -342,8 +347,11 @@
|
||||
" p_hat = n_s / total_preferences\n",
|
||||
"\n",
|
||||
" denominator = 1 + (z**2) / total_preferences\n",
|
||||
" adjustment = (z / denominator) * sqrt(p_hat*(1-p_hat)/total_preferences + (z**2)/(4*total_preferences*total_preferences))\n",
|
||||
" center = (p_hat + (z**2) / (2*total_preferences)) / denominator\n",
|
||||
" adjustment = (z / denominator) * sqrt(\n",
|
||||
" p_hat * (1 - p_hat) / total_preferences\n",
|
||||
" + (z**2) / (4 * total_preferences * total_preferences)\n",
|
||||
" )\n",
|
||||
" center = (p_hat + (z**2) / (2 * total_preferences)) / denominator\n",
|
||||
" lower_bound = min(max(center - adjustment, 0.0), 1.0)\n",
|
||||
" upper_bound = min(max(center + adjustment, 0.0), 1.0)\n",
|
||||
"\n",
|
||||
@@ -369,7 +377,9 @@
|
||||
"source": [
|
||||
"for which_, name in name_map.items():\n",
|
||||
" low, high = wilson_score_interval(preferences, which=which_)\n",
|
||||
" print(f'The \"{name}\" would be preferred between {low:.2%} and {high:.2%} percent of the time (with 95% confidence).')"
|
||||
" print(\n",
|
||||
" f'The \"{name}\" would be preferred between {low:.2%} and {high:.2%} percent of the time (with 95% confidence).'\n",
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -398,13 +408,16 @@
|
||||
],
|
||||
"source": [
|
||||
"from scipy import stats\n",
|
||||
"\n",
|
||||
"preferred_model = max(pref_ratios, key=pref_ratios.get)\n",
|
||||
"successes = preferences.count(preferred_model)\n",
|
||||
"n = len(preferences) - preferences.count(None)\n",
|
||||
"p_value = stats.binom_test(successes, n, p=0.5, alternative='two-sided')\n",
|
||||
"print(f\"\"\"The p-value is {p_value:.5f}. If the null hypothesis is true (i.e., if the selected eval chain actually has no preference between the models),\n",
|
||||
"p_value = stats.binom_test(successes, n, p=0.5, alternative=\"two-sided\")\n",
|
||||
"print(\n",
|
||||
" f\"\"\"The p-value is {p_value:.5f}. If the null hypothesis is true (i.e., if the selected eval chain actually has no preference between the models),\n",
|
||||
"then there is a {p_value:.5%} chance of observing the {name_map.get(preferred_model)} be preferred at least {successes}\n",
|
||||
"times out of {n} trials.\"\"\")"
|
||||
"times out of {n} trials.\"\"\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@@ -54,7 +54,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"llm = ChatOpenAI(temperature=0)\n",
|
||||
"query=\"What's the origin of the term synecdoche?\"\n",
|
||||
"query = \"What's the origin of the term synecdoche?\"\n",
|
||||
"prediction = llm.predict(query)"
|
||||
]
|
||||
},
|
||||
@@ -151,19 +151,22 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=\"correctness\", requires_reference=True)\n",
|
||||
"eval_chain = CriteriaEvalChain.from_llm(\n",
|
||||
" llm=llm, criteria=\"correctness\", requires_reference=True\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# We can even override the model's learned knowledge using ground truth labels\n",
|
||||
"eval_result = eval_chain.evaluate_strings(\n",
|
||||
" input=\"What is the capital of the US?\",\n",
|
||||
" prediction=\"Topeka, KS\", \n",
|
||||
" reference=\"The capital of the US is Topeka, KS, where it permanently moved from Washington D.C. on May 16, 2023\")\n",
|
||||
" prediction=\"Topeka, KS\",\n",
|
||||
" reference=\"The capital of the US is Topeka, KS, where it permanently moved from Washington D.C. on May 16, 2023\",\n",
|
||||
")\n",
|
||||
"print(f'With ground truth: {eval_result[\"score\"]}')\n",
|
||||
"\n",
|
||||
"eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=\"correctness\")\n",
|
||||
"eval_result = eval_chain.evaluate_strings(\n",
|
||||
" input=\"What is the capital of the US?\",\n",
|
||||
" prediction=\"Topeka, KS\", \n",
|
||||
" prediction=\"Topeka, KS\",\n",
|
||||
")\n",
|
||||
"print(f'Withoutg ground truth: {eval_result[\"score\"]}')"
|
||||
]
|
||||
@@ -230,9 +233,7 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"custom_criterion = {\n",
|
||||
" \"numeric\": \"Does the output contain numeric information?\"\n",
|
||||
"}\n",
|
||||
"custom_criterion = {\"numeric\": \"Does the output contain numeric information?\"}\n",
|
||||
"\n",
|
||||
"eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=custom_criterion)\n",
|
||||
"eval_result = eval_chain.evaluate_strings(prediction=prediction, input=query)\n",
|
||||
@@ -269,11 +270,17 @@
|
||||
"\n",
|
||||
"# Example that complies\n",
|
||||
"query = \"What's the population of lagos?\"\n",
|
||||
"eval_result = eval_chain.evaluate_strings(prediction=\"I think that's a great question, you're really curious! About 30 million people live in Lagos, Nigeria, as of 2023.\", input=query)\n",
|
||||
"eval_result = eval_chain.evaluate_strings(\n",
|
||||
" prediction=\"I think that's a great question, you're really curious! About 30 million people live in Lagos, Nigeria, as of 2023.\",\n",
|
||||
" input=query,\n",
|
||||
")\n",
|
||||
"print(\"Meets criteria: \", eval_result[\"score\"])\n",
|
||||
"\n",
|
||||
"# Example that does not comply\n",
|
||||
"eval_result = eval_chain.evaluate_strings(prediction=\"The population of Lagos, Nigeria, is about 30 million people.\", input=query)\n",
|
||||
"eval_result = eval_chain.evaluate_strings(\n",
|
||||
" prediction=\"The population of Lagos, Nigeria, is about 30 million people.\",\n",
|
||||
" input=query,\n",
|
||||
")\n",
|
||||
"print(\"Does not meet criteria: \", eval_result[\"score\"])"
|
||||
]
|
||||
},
|
||||
@@ -350,8 +357,13 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=[PRINCIPLES[\"harmful1\"], PRINCIPLES[\"harmful2\"]])\n",
|
||||
"eval_result = eval_chain.evaluate_strings(prediction=\"I say that man is a lilly-livered nincompoop\", input=\"What do you think of Will?\")\n",
|
||||
"eval_chain = CriteriaEvalChain.from_llm(\n",
|
||||
" llm=llm, criteria=[PRINCIPLES[\"harmful1\"], PRINCIPLES[\"harmful2\"]]\n",
|
||||
")\n",
|
||||
"eval_result = eval_chain.evaluate_strings(\n",
|
||||
" prediction=\"I say that man is a lilly-livered nincompoop\",\n",
|
||||
" input=\"What do you think of Will?\",\n",
|
||||
")\n",
|
||||
"eval_result"
|
||||
]
|
||||
},
|
||||
|
@@ -339,9 +339,9 @@
|
||||
" agent_trajectory=test_outputs_one[\"intermediate_steps\"],\n",
|
||||
" reference=(\n",
|
||||
" \"You need many more than 100,000 ping-pong balls in the empire state building.\"\n",
|
||||
" )\n",
|
||||
" ),\n",
|
||||
")\n",
|
||||
" \n",
|
||||
"\n",
|
||||
"\n",
|
||||
"print(\"Score from 1 to 5: \", evaluation[\"score\"])\n",
|
||||
"print(\"Reasoning: \", evaluation[\"reasoning\"])"
|
||||
|
Reference in New Issue
Block a user