mirror of
https://github.com/hwchase17/langchain.git
synced 2026-02-06 09:10:27 +00:00
Compare commits
65 Commits
langchain-
...
jacob/temp
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ef0965ec36 | ||
|
|
8a71f1b41b | ||
|
|
8e3e532e7d | ||
|
|
12e490ea56 | ||
|
|
498a482e76 | ||
|
|
d324fd1821 | ||
|
|
4bd005adb6 | ||
|
|
e01c6789c4 | ||
|
|
dd2d094adc | ||
|
|
6b98207eda | ||
|
|
c5bf114c0f | ||
|
|
015ab91b83 | ||
|
|
5a3aaae6dc | ||
|
|
75c3c81b8c | ||
|
|
0f7b8adddf | ||
|
|
09c0823c3a | ||
|
|
32f5147523 | ||
|
|
4255a30f20 | ||
|
|
49dea06af1 | ||
|
|
937b3904eb | ||
|
|
bda3becbe7 | ||
|
|
f6e6a17878 | ||
|
|
c1bd4e05bc | ||
|
|
a2e90a5a43 | ||
|
|
a06818a654 | ||
|
|
df98552b6f | ||
|
|
b83f1eb0d5 | ||
|
|
9f0c76bf89 | ||
|
|
01ecd0acba | ||
|
|
1fd1c1dca5 | ||
|
|
253ceca76a | ||
|
|
e18511bb22 | ||
|
|
34da8be60b | ||
|
|
b297af5482 | ||
|
|
4cdaca67dc | ||
|
|
d72a08a60d | ||
|
|
8eb63a609e | ||
|
|
5150ec3a04 | ||
|
|
2b4fbcb4b4 | ||
|
|
eb3870e9d8 | ||
|
|
75ae585deb | ||
|
|
b7c070d437 | ||
|
|
60b65528c5 | ||
|
|
2ef9d12372 | ||
|
|
6910b0b3aa | ||
|
|
831708beb7 | ||
|
|
a114255b82 | ||
|
|
6f68c8d6ab | ||
|
|
8afbab4cf6 | ||
|
|
66e30efa61 | ||
|
|
ba167dc158 | ||
|
|
44f69063b1 | ||
|
|
f18b77fd59 | ||
|
|
966b408634 | ||
|
|
bd261456f6 | ||
|
|
ec8ffc8f40 | ||
|
|
2494cecabf | ||
|
|
df632b8cde | ||
|
|
1050e890c6 | ||
|
|
c4779f5b9c | ||
|
|
0a99935794 | ||
|
|
63aba3fe5b | ||
|
|
dc80be5efe | ||
|
|
ab29ee79a3 | ||
|
|
1d3f7231b8 |
7
.github/scripts/check_diff.py
vendored
7
.github/scripts/check_diff.py
vendored
@@ -68,6 +68,13 @@ def dependents_graph() -> dict:
|
||||
|
||||
if "langchain" in dep:
|
||||
dependents[dep].add(pkg_dir)
|
||||
|
||||
# remove huggingface from dependents because of CI instability
|
||||
# specifically in huggingface jobs
|
||||
# https://github.com/langchain-ai/langchain/issues/25558
|
||||
for k in dependents:
|
||||
if "libs/partners/huggingface" in dependents[k]:
|
||||
dependents[k].remove("libs/partners/huggingface")
|
||||
return dependents
|
||||
|
||||
|
||||
|
||||
@@ -281,7 +281,7 @@ def _construct_doc(
|
||||
module_doc = f"""\
|
||||
.. currentmodule:: {package_namespace}
|
||||
|
||||
.. _{module}:
|
||||
.. _{package_namespace}_{module}:
|
||||
"""
|
||||
_members = members_by_namespace[module]
|
||||
classes = [
|
||||
@@ -317,8 +317,8 @@ def _construct_doc(
|
||||
"""
|
||||
|
||||
index_autosummary += f"""
|
||||
:ref:`{module}`
|
||||
{'^' * (len(module) + 5)}
|
||||
:ref:`{package_namespace}_{module}`
|
||||
{'^' * (len(package_namespace) + len(module) + 8)}
|
||||
"""
|
||||
|
||||
if classes:
|
||||
@@ -394,7 +394,7 @@ def _construct_doc(
|
||||
"""
|
||||
|
||||
index_autosummary += """
|
||||
**Deprecated classes*
|
||||
**Deprecated classes**
|
||||
|
||||
.. autosummary::
|
||||
"""
|
||||
@@ -438,7 +438,7 @@ def _construct_doc(
|
||||
{fstring}
|
||||
|
||||
"""
|
||||
index_autosummary += """
|
||||
index_autosummary += f"""
|
||||
**Deprecated functions**
|
||||
|
||||
.. autosummary::
|
||||
@@ -448,7 +448,6 @@ def _construct_doc(
|
||||
"""
|
||||
docs.append((f"{module}.rst", module_doc))
|
||||
docs.append(("index.rst", index_doc + index_autosummary))
|
||||
|
||||
return docs
|
||||
|
||||
|
||||
@@ -530,47 +529,7 @@ def _build_index(dirs: List[str]) -> None:
|
||||
ordered = ["core", "langchain", "text-splitters", "community", "experimental"]
|
||||
main_ = [dir_ for dir_ in ordered if dir_ in dirs]
|
||||
integrations = sorted(dir_ for dir_ in dirs if dir_ not in main_)
|
||||
main_headers = [
|
||||
" ".join(custom_names.get(x, x.title()) for x in dir_.split("-"))
|
||||
for dir_ in main_
|
||||
]
|
||||
integration_headers = [
|
||||
" ".join(
|
||||
custom_names.get(x, x.title().replace("ai", "AI").replace("db", "DB"))
|
||||
for x in dir_.split("-")
|
||||
)
|
||||
for dir_ in integrations
|
||||
]
|
||||
main_tree = "\n".join(
|
||||
f"{header_name}<{dir_.replace('-', '_')}/index>"
|
||||
for header_name, dir_ in zip(main_headers, main_)
|
||||
)
|
||||
main_grid = "\n".join(
|
||||
f'- header: "**{header_name}**"\n content: "{_package_namespace(dir_).replace("_", "-")}: {_get_package_version(_package_dir(dir_))}"\n link: {dir_.replace("-", "_")}/index.html'
|
||||
for header_name, dir_ in zip(main_headers, main_)
|
||||
)
|
||||
integration_tree = "\n".join(
|
||||
f"{header_name}<{dir_.replace('-', '_')}/index>"
|
||||
for header_name, dir_ in zip(integration_headers, integrations)
|
||||
)
|
||||
|
||||
integration_grid = ""
|
||||
integrations_to_show = [
|
||||
"openai",
|
||||
"anthropic",
|
||||
"google-vertexai",
|
||||
"aws",
|
||||
"huggingface",
|
||||
"mistralai",
|
||||
]
|
||||
for header_name, dir_ in sorted(
|
||||
zip(integration_headers, integrations),
|
||||
key=lambda h_d: integrations_to_show.index(h_d[1])
|
||||
if h_d[1] in integrations_to_show
|
||||
else len(integrations_to_show),
|
||||
)[: len(integrations_to_show)]:
|
||||
integration_grid += f'\n- header: "**{header_name}**"\n content: {_package_namespace(dir_).replace("_", "-")} {_get_package_version(_package_dir(dir_))}\n link: {dir_.replace("-", "_")}/index.html'
|
||||
doc = f"""# LangChain Python API Reference
|
||||
doc = """# LangChain Python API Reference
|
||||
|
||||
Welcome to the LangChain Python API reference. This is a reference for all
|
||||
`langchain-x` packages.
|
||||
@@ -578,8 +537,22 @@ Welcome to the LangChain Python API reference. This is a reference for all
|
||||
For user guides see [https://python.langchain.com](https://python.langchain.com).
|
||||
|
||||
For the legacy API reference hosted on ReadTheDocs see [https://api.python.langchain.com/](https://api.python.langchain.com/).
|
||||
"""
|
||||
|
||||
## Base packages
|
||||
if main_:
|
||||
main_headers = [
|
||||
" ".join(custom_names.get(x, x.title()) for x in dir_.split("-"))
|
||||
for dir_ in main_
|
||||
]
|
||||
main_tree = "\n".join(
|
||||
f"{header_name}<{dir_.replace('-', '_')}/index>"
|
||||
for header_name, dir_ in zip(main_headers, main_)
|
||||
)
|
||||
main_grid = "\n".join(
|
||||
f'- header: "**{header_name}**"\n content: "{_package_namespace(dir_).replace("_", "-")}: {_get_package_version(_package_dir(dir_))}"\n link: {dir_.replace("-", "_")}/index.html'
|
||||
for header_name, dir_ in zip(main_headers, main_)
|
||||
)
|
||||
doc += f"""## Base packages
|
||||
|
||||
```{{gallery-grid}}
|
||||
:grid-columns: "1 2 2 3"
|
||||
@@ -594,8 +567,37 @@ For the legacy API reference hosted on ReadTheDocs see [https://api.python.langc
|
||||
|
||||
{main_tree}
|
||||
```
|
||||
"""
|
||||
if integrations:
|
||||
integration_headers = [
|
||||
" ".join(
|
||||
custom_names.get(x, x.title().replace("ai", "AI").replace("db", "DB"))
|
||||
for x in dir_.split("-")
|
||||
)
|
||||
for dir_ in integrations
|
||||
]
|
||||
integration_tree = "\n".join(
|
||||
f"{header_name}<{dir_.replace('-', '_')}/index>"
|
||||
for header_name, dir_ in zip(integration_headers, integrations)
|
||||
)
|
||||
|
||||
## Integrations
|
||||
integration_grid = ""
|
||||
integrations_to_show = [
|
||||
"openai",
|
||||
"anthropic",
|
||||
"google-vertexai",
|
||||
"aws",
|
||||
"huggingface",
|
||||
"mistralai",
|
||||
]
|
||||
for header_name, dir_ in sorted(
|
||||
zip(integration_headers, integrations),
|
||||
key=lambda h_d: integrations_to_show.index(h_d[1])
|
||||
if h_d[1] in integrations_to_show
|
||||
else len(integrations_to_show),
|
||||
)[: len(integrations_to_show)]:
|
||||
integration_grid += f'\n- header: "**{header_name}**"\n content: {_package_namespace(dir_).replace("_", "-")} {_get_package_version(_package_dir(dir_))}\n link: {dir_.replace("-", "_")}/index.html'
|
||||
doc += f"""## Integrations
|
||||
|
||||
```{{gallery-grid}}
|
||||
:grid-columns: "1 2 2 3"
|
||||
@@ -612,7 +614,6 @@ See the full list of integrations in the Section Navigation.
|
||||
|
||||
{integration_tree}
|
||||
```
|
||||
|
||||
"""
|
||||
with open(HERE / "reference.md", "w") as f:
|
||||
f.write(doc)
|
||||
|
||||
@@ -4,8 +4,11 @@ LangChain implements the latest research in the field of Natural Language Proces
|
||||
This page contains `arXiv` papers referenced in the LangChain Documentation, API Reference,
|
||||
Templates, and Cookbooks.
|
||||
|
||||
From the opposite direction, scientists use LangChain in research and reference LangChain in the research papers.
|
||||
Here you find [such papers](https://arxiv.org/search/?query=langchain&searchtype=all&source=header).
|
||||
From the opposite direction, scientists use `LangChain` in research and reference it in the research papers.
|
||||
Here you find papers that reference:
|
||||
- [LangChain](https://arxiv.org/search/?query=langchain&searchtype=all&source=header)
|
||||
- [LangGraph](https://arxiv.org/search/?query=langgraph&searchtype=all&source=header)
|
||||
- [LangSmith](https://arxiv.org/search/?query=langsmith&searchtype=all&source=header)
|
||||
|
||||
## Summary
|
||||
|
||||
@@ -23,32 +26,30 @@ Here you find [such papers](https://arxiv.org/search/?query=langchain&searchtype
|
||||
| `2305.14283v3` [Query Rewriting for Retrieval-Augmented Large Language Models](http://arxiv.org/abs/2305.14283v3) | Xinbei Ma, Yeyun Gong, Pengcheng He, et al. | 2023-05-23 | `Template:` [rewrite-retrieve-read](https://python.langchain.com/docs/templates/rewrite-retrieve-read), `Cookbook:` [rewrite](https://github.com/langchain-ai/langchain/blob/master/cookbook/rewrite.ipynb)
|
||||
| `2305.08291v1` [Large Language Model Guided Tree-of-Thought](http://arxiv.org/abs/2305.08291v1) | Jieyi Long | 2023-05-15 | `API:` [langchain_experimental.tot](https://api.python.langchain.com/en/latest/experimental_api_reference.html#module-langchain_experimental.tot), `Cookbook:` [tree_of_thought](https://github.com/langchain-ai/langchain/blob/master/cookbook/tree_of_thought.ipynb)
|
||||
| `2305.04091v3` [Plan-and-Solve Prompting: Improving Zero-Shot Chain-of-Thought Reasoning by Large Language Models](http://arxiv.org/abs/2305.04091v3) | Lei Wang, Wanyu Xu, Yihuai Lan, et al. | 2023-05-06 | `Cookbook:` [plan_and_execute_agent](https://github.com/langchain-ai/langchain/blob/master/cookbook/plan_and_execute_agent.ipynb)
|
||||
| `2305.02156v1` [Zero-Shot Listwise Document Reranking with a Large Language Model](http://arxiv.org/abs/2305.02156v1) | Xueguang Ma, Xinyu Zhang, Ronak Pradeep, et al. | 2023-05-03 | `API:` [langchain...LLMListwiseRerank](https://api.python.langchain.com/en/latest/retrievers/langchain.retrievers.document_compressors.listwise_rerank.LLMListwiseRerank.html#langchain.retrievers.document_compressors.listwise_rerank.LLMListwiseRerank)
|
||||
| `2304.08485v2` [Visual Instruction Tuning](http://arxiv.org/abs/2304.08485v2) | Haotian Liu, Chunyuan Li, Qingyang Wu, et al. | 2023-04-17 | `Cookbook:` [Semi_structured_and_multi_modal_RAG](https://github.com/langchain-ai/langchain/blob/master/cookbook/Semi_structured_and_multi_modal_RAG.ipynb), [Semi_structured_multi_modal_RAG_LLaMA2](https://github.com/langchain-ai/langchain/blob/master/cookbook/Semi_structured_multi_modal_RAG_LLaMA2.ipynb)
|
||||
| `2304.03442v2` [Generative Agents: Interactive Simulacra of Human Behavior](http://arxiv.org/abs/2304.03442v2) | Joon Sung Park, Joseph C. O'Brien, Carrie J. Cai, et al. | 2023-04-07 | `Cookbook:` [multiagent_bidding](https://github.com/langchain-ai/langchain/blob/master/cookbook/multiagent_bidding.ipynb), [generative_agents_interactive_simulacra_of_human_behavior](https://github.com/langchain-ai/langchain/blob/master/cookbook/generative_agents_interactive_simulacra_of_human_behavior.ipynb)
|
||||
| `2303.17760v2` [CAMEL: Communicative Agents for "Mind" Exploration of Large Language Model Society](http://arxiv.org/abs/2303.17760v2) | Guohao Li, Hasan Abed Al Kader Hammoud, Hani Itani, et al. | 2023-03-31 | `Cookbook:` [camel_role_playing](https://github.com/langchain-ai/langchain/blob/master/cookbook/camel_role_playing.ipynb)
|
||||
| `2303.17580v4` [HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in Hugging Face](http://arxiv.org/abs/2303.17580v4) | Yongliang Shen, Kaitao Song, Xu Tan, et al. | 2023-03-30 | `API:` [langchain_experimental.autonomous_agents](https://api.python.langchain.com/en/latest/experimental_api_reference.html#module-langchain_experimental.autonomous_agents), `Cookbook:` [hugginggpt](https://github.com/langchain-ai/langchain/blob/master/cookbook/hugginggpt.ipynb)
|
||||
| `2303.08774v6` [GPT-4 Technical Report](http://arxiv.org/abs/2303.08774v6) | OpenAI, Josh Achiam, Steven Adler, et al. | 2023-03-15 | `Docs:` [docs/integrations/vectorstores/mongodb_atlas](https://python.langchain.com/docs/integrations/vectorstores/mongodb_atlas)
|
||||
| `2301.10226v4` [A Watermark for Large Language Models](http://arxiv.org/abs/2301.10226v4) | John Kirchenbauer, Jonas Geiping, Yuxin Wen, et al. | 2023-01-24 | `API:` [langchain_community...HuggingFaceEndpoint](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint.html#langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint), [langchain_huggingface...HuggingFaceEndpoint](https://api.python.langchain.com/en/latest/llms/langchain_huggingface.llms.huggingface_endpoint.HuggingFaceEndpoint.html#langchain_huggingface.llms.huggingface_endpoint.HuggingFaceEndpoint), [langchain_community...OCIModelDeploymentTGI](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.oci_data_science_model_deployment_endpoint.OCIModelDeploymentTGI.html#langchain_community.llms.oci_data_science_model_deployment_endpoint.OCIModelDeploymentTGI), [langchain_community...HuggingFaceTextGenInference](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference.html#langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference)
|
||||
| `2301.10226v4` [A Watermark for Large Language Models](http://arxiv.org/abs/2301.10226v4) | John Kirchenbauer, Jonas Geiping, Yuxin Wen, et al. | 2023-01-24 | `API:` [langchain_community...OCIModelDeploymentTGI](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.oci_data_science_model_deployment_endpoint.OCIModelDeploymentTGI.html#langchain_community.llms.oci_data_science_model_deployment_endpoint.OCIModelDeploymentTGI), [langchain_huggingface...HuggingFaceEndpoint](https://api.python.langchain.com/en/latest/llms/langchain_huggingface.llms.huggingface_endpoint.HuggingFaceEndpoint.html#langchain_huggingface.llms.huggingface_endpoint.HuggingFaceEndpoint), [langchain_community...HuggingFaceEndpoint](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint.html#langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint), [langchain_community...HuggingFaceTextGenInference](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference.html#langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference)
|
||||
| `2212.10496v1` [Precise Zero-Shot Dense Retrieval without Relevance Labels](http://arxiv.org/abs/2212.10496v1) | Luyu Gao, Xueguang Ma, Jimmy Lin, et al. | 2022-12-20 | `API:` [langchain...HypotheticalDocumentEmbedder](https://api.python.langchain.com/en/latest/chains/langchain.chains.hyde.base.HypotheticalDocumentEmbedder.html#langchain.chains.hyde.base.HypotheticalDocumentEmbedder), `Template:` [hyde](https://python.langchain.com/docs/templates/hyde), `Cookbook:` [hypothetical_document_embeddings](https://github.com/langchain-ai/langchain/blob/master/cookbook/hypothetical_document_embeddings.ipynb)
|
||||
| `2212.07425v3` [Robust and Explainable Identification of Logical Fallacies in Natural Language Arguments](http://arxiv.org/abs/2212.07425v3) | Zhivar Sourati, Vishnu Priya Prasanna Venkatesh, Darshan Deshpande, et al. | 2022-12-12 | `API:` [langchain_experimental.fallacy_removal](https://api.python.langchain.com/en/latest/experimental_api_reference.html#module-langchain_experimental.fallacy_removal)
|
||||
| `2211.13892v2` [Complementary Explanations for Effective In-Context Learning](http://arxiv.org/abs/2211.13892v2) | Xi Ye, Srinivasan Iyer, Asli Celikyilmaz, et al. | 2022-11-25 | `API:` [langchain_core...MaxMarginalRelevanceExampleSelector](https://api.python.langchain.com/en/latest/example_selectors/langchain_core.example_selectors.semantic_similarity.MaxMarginalRelevanceExampleSelector.html#langchain_core.example_selectors.semantic_similarity.MaxMarginalRelevanceExampleSelector)
|
||||
| `2211.10435v2` [PAL: Program-aided Language Models](http://arxiv.org/abs/2211.10435v2) | Luyu Gao, Aman Madaan, Shuyan Zhou, et al. | 2022-11-18 | `API:` [langchain_experimental...PALChain](https://api.python.langchain.com/en/latest/pal_chain/langchain_experimental.pal_chain.base.PALChain.html#langchain_experimental.pal_chain.base.PALChain), [langchain_experimental.pal_chain](https://api.python.langchain.com/en/latest/experimental_api_reference.html#module-langchain_experimental.pal_chain), `Cookbook:` [program_aided_language_model](https://github.com/langchain-ai/langchain/blob/master/cookbook/program_aided_language_model.ipynb)
|
||||
| `2210.03629v3` [ReAct: Synergizing Reasoning and Acting in Language Models](http://arxiv.org/abs/2210.03629v3) | Shunyu Yao, Jeffrey Zhao, Dian Yu, et al. | 2022-10-06 | `Docs:` [docs/integrations/providers/cohere](https://python.langchain.com/docs/integrations/providers/cohere), [docs/integrations/chat/huggingface](https://python.langchain.com/docs/integrations/chat/huggingface), [docs/integrations/tools/ionic_shopping](https://python.langchain.com/docs/integrations/tools/ionic_shopping), `API:` [langchain...create_react_agent](https://api.python.langchain.com/en/latest/agents/langchain.agents.react.agent.create_react_agent.html#langchain.agents.react.agent.create_react_agent), [langchain...TrajectoryEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.agents.trajectory_eval_chain.TrajectoryEvalChain.html#langchain.evaluation.agents.trajectory_eval_chain.TrajectoryEvalChain)
|
||||
| `2211.10435v2` [PAL: Program-aided Language Models](http://arxiv.org/abs/2211.10435v2) | Luyu Gao, Aman Madaan, Shuyan Zhou, et al. | 2022-11-18 | `API:` [langchain_experimental.pal_chain](https://api.python.langchain.com/en/latest/experimental_api_reference.html#module-langchain_experimental.pal_chain), [langchain_experimental...PALChain](https://api.python.langchain.com/en/latest/pal_chain/langchain_experimental.pal_chain.base.PALChain.html#langchain_experimental.pal_chain.base.PALChain), `Cookbook:` [program_aided_language_model](https://github.com/langchain-ai/langchain/blob/master/cookbook/program_aided_language_model.ipynb)
|
||||
| `2210.03629v3` [ReAct: Synergizing Reasoning and Acting in Language Models](http://arxiv.org/abs/2210.03629v3) | Shunyu Yao, Jeffrey Zhao, Dian Yu, et al. | 2022-10-06 | `Docs:` [docs/integrations/providers/cohere](https://python.langchain.com/docs/integrations/providers/cohere), [docs/integrations/tools/ionic_shopping](https://python.langchain.com/docs/integrations/tools/ionic_shopping), `API:` [langchain...TrajectoryEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.agents.trajectory_eval_chain.TrajectoryEvalChain.html#langchain.evaluation.agents.trajectory_eval_chain.TrajectoryEvalChain), [langchain...create_react_agent](https://api.python.langchain.com/en/latest/agents/langchain.agents.react.agent.create_react_agent.html#langchain.agents.react.agent.create_react_agent)
|
||||
| `2209.10785v2` [Deep Lake: a Lakehouse for Deep Learning](http://arxiv.org/abs/2209.10785v2) | Sasun Hambardzumyan, Abhinav Tuli, Levon Ghukasyan, et al. | 2022-09-22 | `Docs:` [docs/integrations/providers/activeloop_deeplake](https://python.langchain.com/docs/integrations/providers/activeloop_deeplake)
|
||||
| `2205.13147v4` [Matryoshka Representation Learning](http://arxiv.org/abs/2205.13147v4) | Aditya Kusupati, Gantavya Bhatt, Aniket Rege, et al. | 2022-05-26 | `Docs:` [docs/integrations/providers/snowflake](https://python.langchain.com/docs/integrations/providers/snowflake)
|
||||
| `2205.12654v1` [Bitext Mining Using Distilled Sentence Representations for Low-Resource Languages](http://arxiv.org/abs/2205.12654v1) | Kevin Heffernan, Onur Çelebi, Holger Schwenk | 2022-05-25 | `API:` [langchain_community...LaserEmbeddings](https://api.python.langchain.com/en/latest/embeddings/langchain_community.embeddings.laser.LaserEmbeddings.html#langchain_community.embeddings.laser.LaserEmbeddings)
|
||||
| `2204.00498v1` [Evaluating the Text-to-SQL Capabilities of Large Language Models](http://arxiv.org/abs/2204.00498v1) | Nitarshan Rajkumar, Raymond Li, Dzmitry Bahdanau | 2022-03-15 | `API:` [langchain_community...SparkSQL](https://api.python.langchain.com/en/latest/utilities/langchain_community.utilities.spark_sql.SparkSQL.html#langchain_community.utilities.spark_sql.SparkSQL), [langchain_community...SQLDatabase](https://api.python.langchain.com/en/latest/utilities/langchain_community.utilities.sql_database.SQLDatabase.html#langchain_community.utilities.sql_database.SQLDatabase)
|
||||
| `2202.00666v5` [Locally Typical Sampling](http://arxiv.org/abs/2202.00666v5) | Clara Meister, Tiago Pimentel, Gian Wiher, et al. | 2022-02-01 | `API:` [langchain_community...HuggingFaceEndpoint](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint.html#langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint), [langchain_huggingface...HuggingFaceEndpoint](https://api.python.langchain.com/en/latest/llms/langchain_huggingface.llms.huggingface_endpoint.HuggingFaceEndpoint.html#langchain_huggingface.llms.huggingface_endpoint.HuggingFaceEndpoint), [langchain_community...HuggingFaceTextGenInference](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference.html#langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference)
|
||||
| `2204.00498v1` [Evaluating the Text-to-SQL Capabilities of Large Language Models](http://arxiv.org/abs/2204.00498v1) | Nitarshan Rajkumar, Raymond Li, Dzmitry Bahdanau | 2022-03-15 | `API:` [langchain_community...SQLDatabase](https://api.python.langchain.com/en/latest/utilities/langchain_community.utilities.sql_database.SQLDatabase.html#langchain_community.utilities.sql_database.SQLDatabase), [langchain_community...SparkSQL](https://api.python.langchain.com/en/latest/utilities/langchain_community.utilities.spark_sql.SparkSQL.html#langchain_community.utilities.spark_sql.SparkSQL)
|
||||
| `2202.00666v5` [Locally Typical Sampling](http://arxiv.org/abs/2202.00666v5) | Clara Meister, Tiago Pimentel, Gian Wiher, et al. | 2022-02-01 | `API:` [langchain_huggingface...HuggingFaceEndpoint](https://api.python.langchain.com/en/latest/llms/langchain_huggingface.llms.huggingface_endpoint.HuggingFaceEndpoint.html#langchain_huggingface.llms.huggingface_endpoint.HuggingFaceEndpoint), [langchain_community...HuggingFaceEndpoint](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint.html#langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint), [langchain_community...HuggingFaceTextGenInference](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference.html#langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference)
|
||||
| `2103.00020v1` [Learning Transferable Visual Models From Natural Language Supervision](http://arxiv.org/abs/2103.00020v1) | Alec Radford, Jong Wook Kim, Chris Hallacy, et al. | 2021-02-26 | `API:` [langchain_experimental.open_clip](https://api.python.langchain.com/en/latest/experimental_api_reference.html#module-langchain_experimental.open_clip)
|
||||
| `1909.05858v2` [CTRL: A Conditional Transformer Language Model for Controllable Generation](http://arxiv.org/abs/1909.05858v2) | Nitish Shirish Keskar, Bryan McCann, Lav R. Varshney, et al. | 2019-09-11 | `API:` [langchain_community...HuggingFaceEndpoint](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint.html#langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint), [langchain_huggingface...HuggingFaceEndpoint](https://api.python.langchain.com/en/latest/llms/langchain_huggingface.llms.huggingface_endpoint.HuggingFaceEndpoint.html#langchain_huggingface.llms.huggingface_endpoint.HuggingFaceEndpoint), [langchain_community...HuggingFaceTextGenInference](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference.html#langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference)
|
||||
| `1908.10084v1` [Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks](http://arxiv.org/abs/1908.10084v1) | Nils Reimers, Iryna Gurevych | 2019-08-27 | `Docs:` [docs/integrations/text_embedding/sentence_transformers](https://python.langchain.com/docs/integrations/text_embedding/sentence_transformers)
|
||||
| `1909.05858v2` [CTRL: A Conditional Transformer Language Model for Controllable Generation](http://arxiv.org/abs/1909.05858v2) | Nitish Shirish Keskar, Bryan McCann, Lav R. Varshney, et al. | 2019-09-11 | `API:` [langchain_huggingface...HuggingFaceEndpoint](https://api.python.langchain.com/en/latest/llms/langchain_huggingface.llms.huggingface_endpoint.HuggingFaceEndpoint.html#langchain_huggingface.llms.huggingface_endpoint.HuggingFaceEndpoint), [langchain_community...HuggingFaceEndpoint](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint.html#langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint), [langchain_community...HuggingFaceTextGenInference](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference.html#langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference)
|
||||
|
||||
## Self-Discover: Large Language Models Self-Compose Reasoning Structures
|
||||
|
||||
- **arXiv id:** 2402.03620v1
|
||||
- **arXiv id:** [2402.03620v1](http://arxiv.org/abs/2402.03620v1) **Published Date:** 2024-02-06
|
||||
- **Title:** Self-Discover: Large Language Models Self-Compose Reasoning Structures
|
||||
- **Authors:** Pei Zhou, Jay Pujara, Xiang Ren, et al.
|
||||
- **Published Date:** 2024-02-06
|
||||
- **URL:** http://arxiv.org/abs/2402.03620v1
|
||||
- **LangChain:**
|
||||
|
||||
- **Cookbook:** [self-discover](https://github.com/langchain-ai/langchain/blob/master/cookbook/self-discover.ipynb)
|
||||
@@ -70,11 +71,9 @@ commonalities with human reasoning patterns.
|
||||
|
||||
## RAPTOR: Recursive Abstractive Processing for Tree-Organized Retrieval
|
||||
|
||||
- **arXiv id:** 2401.18059v1
|
||||
- **arXiv id:** [2401.18059v1](http://arxiv.org/abs/2401.18059v1) **Published Date:** 2024-01-31
|
||||
- **Title:** RAPTOR: Recursive Abstractive Processing for Tree-Organized Retrieval
|
||||
- **Authors:** Parth Sarthi, Salman Abdullah, Aditi Tuli, et al.
|
||||
- **Published Date:** 2024-01-31
|
||||
- **URL:** http://arxiv.org/abs/2401.18059v1
|
||||
- **LangChain:**
|
||||
|
||||
- **Cookbook:** [RAPTOR](https://github.com/langchain-ai/langchain/blob/master/cookbook/RAPTOR.ipynb)
|
||||
@@ -96,11 +95,9 @@ benchmark by 20% in absolute accuracy.
|
||||
|
||||
## Corrective Retrieval Augmented Generation
|
||||
|
||||
- **arXiv id:** 2401.15884v2
|
||||
- **arXiv id:** [2401.15884v2](http://arxiv.org/abs/2401.15884v2) **Published Date:** 2024-01-29
|
||||
- **Title:** Corrective Retrieval Augmented Generation
|
||||
- **Authors:** Shi-Qi Yan, Jia-Chen Gu, Yun Zhu, et al.
|
||||
- **Published Date:** 2024-01-29
|
||||
- **URL:** http://arxiv.org/abs/2401.15884v2
|
||||
- **LangChain:**
|
||||
|
||||
- **Cookbook:** [langgraph_crag](https://github.com/langchain-ai/langchain/blob/master/cookbook/langgraph_crag.ipynb)
|
||||
@@ -126,11 +123,9 @@ performance of RAG-based approaches.
|
||||
|
||||
## Mixtral of Experts
|
||||
|
||||
- **arXiv id:** 2401.04088v1
|
||||
- **arXiv id:** [2401.04088v1](http://arxiv.org/abs/2401.04088v1) **Published Date:** 2024-01-08
|
||||
- **Title:** Mixtral of Experts
|
||||
- **Authors:** Albert Q. Jiang, Alexandre Sablayrolles, Antoine Roux, et al.
|
||||
- **Published Date:** 2024-01-08
|
||||
- **URL:** http://arxiv.org/abs/2401.04088v1
|
||||
- **LangChain:**
|
||||
|
||||
- **Cookbook:** [together_ai](https://github.com/langchain-ai/langchain/blob/master/cookbook/together_ai.ipynb)
|
||||
@@ -152,11 +147,9 @@ the base and instruct models are released under the Apache 2.0 license.
|
||||
|
||||
## Dense X Retrieval: What Retrieval Granularity Should We Use?
|
||||
|
||||
- **arXiv id:** 2312.06648v2
|
||||
- **arXiv id:** [2312.06648v2](http://arxiv.org/abs/2312.06648v2) **Published Date:** 2023-12-11
|
||||
- **Title:** Dense X Retrieval: What Retrieval Granularity Should We Use?
|
||||
- **Authors:** Tong Chen, Hongwei Wang, Sihao Chen, et al.
|
||||
- **Published Date:** 2023-12-11
|
||||
- **URL:** http://arxiv.org/abs/2312.06648v2
|
||||
- **LangChain:**
|
||||
|
||||
- **Template:** [propositional-retrieval](https://python.langchain.com/docs/templates/propositional-retrieval)
|
||||
@@ -181,11 +174,9 @@ information.
|
||||
|
||||
## Chain-of-Note: Enhancing Robustness in Retrieval-Augmented Language Models
|
||||
|
||||
- **arXiv id:** 2311.09210v1
|
||||
- **arXiv id:** [2311.09210v1](http://arxiv.org/abs/2311.09210v1) **Published Date:** 2023-11-15
|
||||
- **Title:** Chain-of-Note: Enhancing Robustness in Retrieval-Augmented Language Models
|
||||
- **Authors:** Wenhao Yu, Hongming Zhang, Xiaoman Pan, et al.
|
||||
- **Published Date:** 2023-11-15
|
||||
- **URL:** http://arxiv.org/abs/2311.09210v1
|
||||
- **LangChain:**
|
||||
|
||||
- **Template:** [chain-of-note-wiki](https://python.langchain.com/docs/templates/chain-of-note-wiki)
|
||||
@@ -215,11 +206,9 @@ outside the pre-training knowledge scope.
|
||||
|
||||
## Self-RAG: Learning to Retrieve, Generate, and Critique through Self-Reflection
|
||||
|
||||
- **arXiv id:** 2310.11511v1
|
||||
- **arXiv id:** [2310.11511v1](http://arxiv.org/abs/2310.11511v1) **Published Date:** 2023-10-17
|
||||
- **Title:** Self-RAG: Learning to Retrieve, Generate, and Critique through Self-Reflection
|
||||
- **Authors:** Akari Asai, Zeqiu Wu, Yizhong Wang, et al.
|
||||
- **Published Date:** 2023-10-17
|
||||
- **URL:** http://arxiv.org/abs/2310.11511v1
|
||||
- **LangChain:**
|
||||
|
||||
- **Cookbook:** [langgraph_self_rag](https://github.com/langchain-ai/langchain/blob/master/cookbook/langgraph_self_rag.ipynb)
|
||||
@@ -248,11 +237,9 @@ to these models.
|
||||
|
||||
## Take a Step Back: Evoking Reasoning via Abstraction in Large Language Models
|
||||
|
||||
- **arXiv id:** 2310.06117v2
|
||||
- **arXiv id:** [2310.06117v2](http://arxiv.org/abs/2310.06117v2) **Published Date:** 2023-10-09
|
||||
- **Title:** Take a Step Back: Evoking Reasoning via Abstraction in Large Language Models
|
||||
- **Authors:** Huaixiu Steven Zheng, Swaroop Mishra, Xinyun Chen, et al.
|
||||
- **Published Date:** 2023-10-09
|
||||
- **URL:** http://arxiv.org/abs/2310.06117v2
|
||||
- **LangChain:**
|
||||
|
||||
- **Template:** [stepback-qa-prompting](https://python.langchain.com/docs/templates/stepback-qa-prompting)
|
||||
@@ -271,11 +258,9 @@ and 11% respectively, TimeQA by 27%, and MuSiQue by 7%.
|
||||
|
||||
## Llama 2: Open Foundation and Fine-Tuned Chat Models
|
||||
|
||||
- **arXiv id:** 2307.09288v2
|
||||
- **arXiv id:** [2307.09288v2](http://arxiv.org/abs/2307.09288v2) **Published Date:** 2023-07-18
|
||||
- **Title:** Llama 2: Open Foundation and Fine-Tuned Chat Models
|
||||
- **Authors:** Hugo Touvron, Louis Martin, Kevin Stone, et al.
|
||||
- **Published Date:** 2023-07-18
|
||||
- **URL:** http://arxiv.org/abs/2307.09288v2
|
||||
- **LangChain:**
|
||||
|
||||
- **Cookbook:** [Semi_Structured_RAG](https://github.com/langchain-ai/langchain/blob/master/cookbook/Semi_Structured_RAG.ipynb)
|
||||
@@ -292,11 +277,9 @@ contribute to the responsible development of LLMs.
|
||||
|
||||
## Query Rewriting for Retrieval-Augmented Large Language Models
|
||||
|
||||
- **arXiv id:** 2305.14283v3
|
||||
- **arXiv id:** [2305.14283v3](http://arxiv.org/abs/2305.14283v3) **Published Date:** 2023-05-23
|
||||
- **Title:** Query Rewriting for Retrieval-Augmented Large Language Models
|
||||
- **Authors:** Xinbei Ma, Yeyun Gong, Pengcheng He, et al.
|
||||
- **Published Date:** 2023-05-23
|
||||
- **URL:** http://arxiv.org/abs/2305.14283v3
|
||||
- **LangChain:**
|
||||
|
||||
- **Template:** [rewrite-retrieve-read](https://python.langchain.com/docs/templates/rewrite-retrieve-read)
|
||||
@@ -322,11 +305,9 @@ for retrieval-augmented LLM.
|
||||
|
||||
## Large Language Model Guided Tree-of-Thought
|
||||
|
||||
- **arXiv id:** 2305.08291v1
|
||||
- **arXiv id:** [2305.08291v1](http://arxiv.org/abs/2305.08291v1) **Published Date:** 2023-05-15
|
||||
- **Title:** Large Language Model Guided Tree-of-Thought
|
||||
- **Authors:** Jieyi Long
|
||||
- **Published Date:** 2023-05-15
|
||||
- **URL:** http://arxiv.org/abs/2305.08291v1
|
||||
- **LangChain:**
|
||||
|
||||
- **API Reference:** [langchain_experimental.tot](https://api.python.langchain.com/en/latest/experimental_api_reference.html#module-langchain_experimental.tot)
|
||||
@@ -352,11 +333,9 @@ implementation of the ToT-based Sudoku solver is available on GitHub:
|
||||
|
||||
## Plan-and-Solve Prompting: Improving Zero-Shot Chain-of-Thought Reasoning by Large Language Models
|
||||
|
||||
- **arXiv id:** 2305.04091v3
|
||||
- **arXiv id:** [2305.04091v3](http://arxiv.org/abs/2305.04091v3) **Published Date:** 2023-05-06
|
||||
- **Title:** Plan-and-Solve Prompting: Improving Zero-Shot Chain-of-Thought Reasoning by Large Language Models
|
||||
- **Authors:** Lei Wang, Wanyu Xu, Yihuai Lan, et al.
|
||||
- **Published Date:** 2023-05-06
|
||||
- **URL:** http://arxiv.org/abs/2305.04091v3
|
||||
- **LangChain:**
|
||||
|
||||
- **Cookbook:** [plan_and_execute_agent](https://github.com/langchain-ai/langchain/blob/master/cookbook/plan_and_execute_agent.ipynb)
|
||||
@@ -383,13 +362,35 @@ Prompting, and has comparable performance with 8-shot CoT prompting on the math
|
||||
reasoning problem. The code can be found at
|
||||
https://github.com/AGI-Edgerunners/Plan-and-Solve-Prompting.
|
||||
|
||||
## Zero-Shot Listwise Document Reranking with a Large Language Model
|
||||
|
||||
- **arXiv id:** [2305.02156v1](http://arxiv.org/abs/2305.02156v1) **Published Date:** 2023-05-03
|
||||
- **Title:** Zero-Shot Listwise Document Reranking with a Large Language Model
|
||||
- **Authors:** Xueguang Ma, Xinyu Zhang, Ronak Pradeep, et al.
|
||||
- **LangChain:**
|
||||
|
||||
- **API Reference:** [langchain...LLMListwiseRerank](https://api.python.langchain.com/en/latest/retrievers/langchain.retrievers.document_compressors.listwise_rerank.LLMListwiseRerank.html#langchain.retrievers.document_compressors.listwise_rerank.LLMListwiseRerank)
|
||||
|
||||
**Abstract:** Supervised ranking methods based on bi-encoder or cross-encoder architectures
|
||||
have shown success in multi-stage text ranking tasks, but they require large
|
||||
amounts of relevance judgments as training data. In this work, we propose
|
||||
Listwise Reranker with a Large Language Model (LRL), which achieves strong
|
||||
reranking effectiveness without using any task-specific training data.
|
||||
Different from the existing pointwise ranking methods, where documents are
|
||||
scored independently and ranked according to the scores, LRL directly generates
|
||||
a reordered list of document identifiers given the candidate documents.
|
||||
Experiments on three TREC web search datasets demonstrate that LRL not only
|
||||
outperforms zero-shot pointwise methods when reranking first-stage retrieval
|
||||
results, but can also act as a final-stage reranker to improve the top-ranked
|
||||
results of a pointwise method for improved efficiency. Additionally, we apply
|
||||
our approach to subsets of MIRACL, a recent multilingual retrieval dataset,
|
||||
with results showing its potential to generalize across different languages.
|
||||
|
||||
## Visual Instruction Tuning
|
||||
|
||||
- **arXiv id:** 2304.08485v2
|
||||
- **arXiv id:** [2304.08485v2](http://arxiv.org/abs/2304.08485v2) **Published Date:** 2023-04-17
|
||||
- **Title:** Visual Instruction Tuning
|
||||
- **Authors:** Haotian Liu, Chunyuan Li, Qingyang Wu, et al.
|
||||
- **Published Date:** 2023-04-17
|
||||
- **URL:** http://arxiv.org/abs/2304.08485v2
|
||||
- **LangChain:**
|
||||
|
||||
- **Cookbook:** [Semi_structured_and_multi_modal_RAG](https://github.com/langchain-ai/langchain/blob/master/cookbook/Semi_structured_and_multi_modal_RAG.ipynb), [Semi_structured_multi_modal_RAG_LLaMA2](https://github.com/langchain-ai/langchain/blob/master/cookbook/Semi_structured_multi_modal_RAG_LLaMA2.ipynb)
|
||||
@@ -412,11 +413,9 @@ publicly available.
|
||||
|
||||
## Generative Agents: Interactive Simulacra of Human Behavior
|
||||
|
||||
- **arXiv id:** 2304.03442v2
|
||||
- **arXiv id:** [2304.03442v2](http://arxiv.org/abs/2304.03442v2) **Published Date:** 2023-04-07
|
||||
- **Title:** Generative Agents: Interactive Simulacra of Human Behavior
|
||||
- **Authors:** Joon Sung Park, Joseph C. O'Brien, Carrie J. Cai, et al.
|
||||
- **Published Date:** 2023-04-07
|
||||
- **URL:** http://arxiv.org/abs/2304.03442v2
|
||||
- **LangChain:**
|
||||
|
||||
- **Cookbook:** [multiagent_bidding](https://github.com/langchain-ai/langchain/blob/master/cookbook/multiagent_bidding.ipynb), [generative_agents_interactive_simulacra_of_human_behavior](https://github.com/langchain-ai/langchain/blob/master/cookbook/generative_agents_interactive_simulacra_of_human_behavior.ipynb)
|
||||
@@ -448,11 +447,9 @@ interaction patterns for enabling believable simulations of human behavior.
|
||||
|
||||
## CAMEL: Communicative Agents for "Mind" Exploration of Large Language Model Society
|
||||
|
||||
- **arXiv id:** 2303.17760v2
|
||||
- **arXiv id:** [2303.17760v2](http://arxiv.org/abs/2303.17760v2) **Published Date:** 2023-03-31
|
||||
- **Title:** CAMEL: Communicative Agents for "Mind" Exploration of Large Language Model Society
|
||||
- **Authors:** Guohao Li, Hasan Abed Al Kader Hammoud, Hani Itani, et al.
|
||||
- **Published Date:** 2023-03-31
|
||||
- **URL:** http://arxiv.org/abs/2303.17760v2
|
||||
- **LangChain:**
|
||||
|
||||
- **Cookbook:** [camel_role_playing](https://github.com/langchain-ai/langchain/blob/master/cookbook/camel_role_playing.ipynb)
|
||||
@@ -478,11 +475,9 @@ agents and beyond: https://github.com/camel-ai/camel.
|
||||
|
||||
## HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in Hugging Face
|
||||
|
||||
- **arXiv id:** 2303.17580v4
|
||||
- **arXiv id:** [2303.17580v4](http://arxiv.org/abs/2303.17580v4) **Published Date:** 2023-03-30
|
||||
- **Title:** HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in Hugging Face
|
||||
- **Authors:** Yongliang Shen, Kaitao Song, Xu Tan, et al.
|
||||
- **Published Date:** 2023-03-30
|
||||
- **URL:** http://arxiv.org/abs/2303.17580v4
|
||||
- **LangChain:**
|
||||
|
||||
- **API Reference:** [langchain_experimental.autonomous_agents](https://api.python.langchain.com/en/latest/experimental_api_reference.html#module-langchain_experimental.autonomous_agents)
|
||||
@@ -508,40 +503,14 @@ modalities and domains and achieve impressive results in language, vision,
|
||||
speech, and other challenging tasks, which paves a new way towards the
|
||||
realization of artificial general intelligence.
|
||||
|
||||
## GPT-4 Technical Report
|
||||
|
||||
- **arXiv id:** 2303.08774v6
|
||||
- **Title:** GPT-4 Technical Report
|
||||
- **Authors:** OpenAI, Josh Achiam, Steven Adler, et al.
|
||||
- **Published Date:** 2023-03-15
|
||||
- **URL:** http://arxiv.org/abs/2303.08774v6
|
||||
- **LangChain:**
|
||||
|
||||
- **Documentation:** [docs/integrations/vectorstores/mongodb_atlas](https://python.langchain.com/docs/integrations/vectorstores/mongodb_atlas)
|
||||
|
||||
**Abstract:** We report the development of GPT-4, a large-scale, multimodal model which can
|
||||
accept image and text inputs and produce text outputs. While less capable than
|
||||
humans in many real-world scenarios, GPT-4 exhibits human-level performance on
|
||||
various professional and academic benchmarks, including passing a simulated bar
|
||||
exam with a score around the top 10% of test takers. GPT-4 is a
|
||||
Transformer-based model pre-trained to predict the next token in a document.
|
||||
The post-training alignment process results in improved performance on measures
|
||||
of factuality and adherence to desired behavior. A core component of this
|
||||
project was developing infrastructure and optimization methods that behave
|
||||
predictably across a wide range of scales. This allowed us to accurately
|
||||
predict some aspects of GPT-4's performance based on models trained with no
|
||||
more than 1/1,000th the compute of GPT-4.
|
||||
|
||||
## A Watermark for Large Language Models
|
||||
|
||||
- **arXiv id:** 2301.10226v4
|
||||
- **arXiv id:** [2301.10226v4](http://arxiv.org/abs/2301.10226v4) **Published Date:** 2023-01-24
|
||||
- **Title:** A Watermark for Large Language Models
|
||||
- **Authors:** John Kirchenbauer, Jonas Geiping, Yuxin Wen, et al.
|
||||
- **Published Date:** 2023-01-24
|
||||
- **URL:** http://arxiv.org/abs/2301.10226v4
|
||||
- **LangChain:**
|
||||
|
||||
- **API Reference:** [langchain_community...HuggingFaceEndpoint](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint.html#langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint), [langchain_huggingface...HuggingFaceEndpoint](https://api.python.langchain.com/en/latest/llms/langchain_huggingface.llms.huggingface_endpoint.HuggingFaceEndpoint.html#langchain_huggingface.llms.huggingface_endpoint.HuggingFaceEndpoint), [langchain_community...OCIModelDeploymentTGI](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.oci_data_science_model_deployment_endpoint.OCIModelDeploymentTGI.html#langchain_community.llms.oci_data_science_model_deployment_endpoint.OCIModelDeploymentTGI), [langchain_community...HuggingFaceTextGenInference](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference.html#langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference)
|
||||
- **API Reference:** [langchain_community...OCIModelDeploymentTGI](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.oci_data_science_model_deployment_endpoint.OCIModelDeploymentTGI.html#langchain_community.llms.oci_data_science_model_deployment_endpoint.OCIModelDeploymentTGI), [langchain_huggingface...HuggingFaceEndpoint](https://api.python.langchain.com/en/latest/llms/langchain_huggingface.llms.huggingface_endpoint.HuggingFaceEndpoint.html#langchain_huggingface.llms.huggingface_endpoint.HuggingFaceEndpoint), [langchain_community...HuggingFaceEndpoint](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint.html#langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint), [langchain_community...HuggingFaceTextGenInference](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference.html#langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference)
|
||||
|
||||
**Abstract:** Potential harms of large language models can be mitigated by watermarking
|
||||
model output, i.e., embedding signals into generated text that are invisible to
|
||||
@@ -559,11 +528,9 @@ family, and discuss robustness and security.
|
||||
|
||||
## Precise Zero-Shot Dense Retrieval without Relevance Labels
|
||||
|
||||
- **arXiv id:** 2212.10496v1
|
||||
- **arXiv id:** [2212.10496v1](http://arxiv.org/abs/2212.10496v1) **Published Date:** 2022-12-20
|
||||
- **Title:** Precise Zero-Shot Dense Retrieval without Relevance Labels
|
||||
- **Authors:** Luyu Gao, Xueguang Ma, Jimmy Lin, et al.
|
||||
- **Published Date:** 2022-12-20
|
||||
- **URL:** http://arxiv.org/abs/2212.10496v1
|
||||
- **LangChain:**
|
||||
|
||||
- **API Reference:** [langchain...HypotheticalDocumentEmbedder](https://api.python.langchain.com/en/latest/chains/langchain.chains.hyde.base.HypotheticalDocumentEmbedder.html#langchain.chains.hyde.base.HypotheticalDocumentEmbedder)
|
||||
@@ -590,11 +557,9 @@ search, QA, fact verification) and languages~(e.g. sw, ko, ja).
|
||||
|
||||
## Robust and Explainable Identification of Logical Fallacies in Natural Language Arguments
|
||||
|
||||
- **arXiv id:** 2212.07425v3
|
||||
- **arXiv id:** [2212.07425v3](http://arxiv.org/abs/2212.07425v3) **Published Date:** 2022-12-12
|
||||
- **Title:** Robust and Explainable Identification of Logical Fallacies in Natural Language Arguments
|
||||
- **Authors:** Zhivar Sourati, Vishnu Priya Prasanna Venkatesh, Darshan Deshpande, et al.
|
||||
- **Published Date:** 2022-12-12
|
||||
- **URL:** http://arxiv.org/abs/2212.07425v3
|
||||
- **LangChain:**
|
||||
|
||||
- **API Reference:** [langchain_experimental.fallacy_removal](https://api.python.langchain.com/en/latest/experimental_api_reference.html#module-langchain_experimental.fallacy_removal)
|
||||
@@ -623,11 +588,9 @@ further work on logical fallacy identification.
|
||||
|
||||
## Complementary Explanations for Effective In-Context Learning
|
||||
|
||||
- **arXiv id:** 2211.13892v2
|
||||
- **arXiv id:** [2211.13892v2](http://arxiv.org/abs/2211.13892v2) **Published Date:** 2022-11-25
|
||||
- **Title:** Complementary Explanations for Effective In-Context Learning
|
||||
- **Authors:** Xi Ye, Srinivasan Iyer, Asli Celikyilmaz, et al.
|
||||
- **Published Date:** 2022-11-25
|
||||
- **URL:** http://arxiv.org/abs/2211.13892v2
|
||||
- **LangChain:**
|
||||
|
||||
- **API Reference:** [langchain_core...MaxMarginalRelevanceExampleSelector](https://api.python.langchain.com/en/latest/example_selectors/langchain_core.example_selectors.semantic_similarity.MaxMarginalRelevanceExampleSelector.html#langchain_core.example_selectors.semantic_similarity.MaxMarginalRelevanceExampleSelector)
|
||||
@@ -651,14 +614,12 @@ performance across three real-world tasks on multiple LLMs.
|
||||
|
||||
## PAL: Program-aided Language Models
|
||||
|
||||
- **arXiv id:** 2211.10435v2
|
||||
- **arXiv id:** [2211.10435v2](http://arxiv.org/abs/2211.10435v2) **Published Date:** 2022-11-18
|
||||
- **Title:** PAL: Program-aided Language Models
|
||||
- **Authors:** Luyu Gao, Aman Madaan, Shuyan Zhou, et al.
|
||||
- **Published Date:** 2022-11-18
|
||||
- **URL:** http://arxiv.org/abs/2211.10435v2
|
||||
- **LangChain:**
|
||||
|
||||
- **API Reference:** [langchain_experimental...PALChain](https://api.python.langchain.com/en/latest/pal_chain/langchain_experimental.pal_chain.base.PALChain.html#langchain_experimental.pal_chain.base.PALChain), [langchain_experimental.pal_chain](https://api.python.langchain.com/en/latest/experimental_api_reference.html#module-langchain_experimental.pal_chain)
|
||||
- **API Reference:** [langchain_experimental.pal_chain](https://api.python.langchain.com/en/latest/experimental_api_reference.html#module-langchain_experimental.pal_chain), [langchain_experimental...PALChain](https://api.python.langchain.com/en/latest/pal_chain/langchain_experimental.pal_chain.base.PALChain.html#langchain_experimental.pal_chain.base.PALChain)
|
||||
- **Cookbook:** [program_aided_language_model](https://github.com/langchain-ai/langchain/blob/master/cookbook/program_aided_language_model.ipynb)
|
||||
|
||||
**Abstract:** Large language models (LLMs) have recently demonstrated an impressive ability
|
||||
@@ -686,15 +647,13 @@ publicly available at http://reasonwithpal.com/ .
|
||||
|
||||
## ReAct: Synergizing Reasoning and Acting in Language Models
|
||||
|
||||
- **arXiv id:** 2210.03629v3
|
||||
- **arXiv id:** [2210.03629v3](http://arxiv.org/abs/2210.03629v3) **Published Date:** 2022-10-06
|
||||
- **Title:** ReAct: Synergizing Reasoning and Acting in Language Models
|
||||
- **Authors:** Shunyu Yao, Jeffrey Zhao, Dian Yu, et al.
|
||||
- **Published Date:** 2022-10-06
|
||||
- **URL:** http://arxiv.org/abs/2210.03629v3
|
||||
- **LangChain:**
|
||||
|
||||
- **Documentation:** [docs/integrations/providers/cohere](https://python.langchain.com/docs/integrations/providers/cohere), [docs/integrations/chat/huggingface](https://python.langchain.com/docs/integrations/chat/huggingface), [docs/integrations/tools/ionic_shopping](https://python.langchain.com/docs/integrations/tools/ionic_shopping)
|
||||
- **API Reference:** [langchain...create_react_agent](https://api.python.langchain.com/en/latest/agents/langchain.agents.react.agent.create_react_agent.html#langchain.agents.react.agent.create_react_agent), [langchain...TrajectoryEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.agents.trajectory_eval_chain.TrajectoryEvalChain.html#langchain.evaluation.agents.trajectory_eval_chain.TrajectoryEvalChain)
|
||||
- **Documentation:** [docs/integrations/providers/cohere](https://python.langchain.com/docs/integrations/providers/cohere), [docs/integrations/tools/ionic_shopping](https://python.langchain.com/docs/integrations/tools/ionic_shopping)
|
||||
- **API Reference:** [langchain...TrajectoryEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.agents.trajectory_eval_chain.TrajectoryEvalChain.html#langchain.evaluation.agents.trajectory_eval_chain.TrajectoryEvalChain), [langchain...create_react_agent](https://api.python.langchain.com/en/latest/agents/langchain.agents.react.agent.create_react_agent.html#langchain.agents.react.agent.create_react_agent)
|
||||
|
||||
**Abstract:** While large language models (LLMs) have demonstrated impressive capabilities
|
||||
across tasks in language understanding and interactive decision making, their
|
||||
@@ -721,11 +680,9 @@ Project site with code: https://react-lm.github.io
|
||||
|
||||
## Deep Lake: a Lakehouse for Deep Learning
|
||||
|
||||
- **arXiv id:** 2209.10785v2
|
||||
- **arXiv id:** [2209.10785v2](http://arxiv.org/abs/2209.10785v2) **Published Date:** 2022-09-22
|
||||
- **Title:** Deep Lake: a Lakehouse for Deep Learning
|
||||
- **Authors:** Sasun Hambardzumyan, Abhinav Tuli, Levon Ghukasyan, et al.
|
||||
- **Published Date:** 2022-09-22
|
||||
- **URL:** http://arxiv.org/abs/2209.10785v2
|
||||
- **LangChain:**
|
||||
|
||||
- **Documentation:** [docs/integrations/providers/activeloop_deeplake](https://python.langchain.com/docs/integrations/providers/activeloop_deeplake)
|
||||
@@ -747,13 +704,43 @@ visualization engine, or (c) deep learning frameworks without sacrificing GPU
|
||||
utilization. Datasets stored in Deep Lake can be accessed from PyTorch,
|
||||
TensorFlow, JAX, and integrate with numerous MLOps tools.
|
||||
|
||||
## Matryoshka Representation Learning
|
||||
|
||||
- **arXiv id:** [2205.13147v4](http://arxiv.org/abs/2205.13147v4) **Published Date:** 2022-05-26
|
||||
- **Title:** Matryoshka Representation Learning
|
||||
- **Authors:** Aditya Kusupati, Gantavya Bhatt, Aniket Rege, et al.
|
||||
- **LangChain:**
|
||||
|
||||
- **Documentation:** [docs/integrations/providers/snowflake](https://python.langchain.com/docs/integrations/providers/snowflake)
|
||||
|
||||
**Abstract:** Learned representations are a central component in modern ML systems, serving
|
||||
a multitude of downstream tasks. When training such representations, it is
|
||||
often the case that computational and statistical constraints for each
|
||||
downstream task are unknown. In this context rigid, fixed capacity
|
||||
representations can be either over or under-accommodating to the task at hand.
|
||||
This leads us to ask: can we design a flexible representation that can adapt to
|
||||
multiple downstream tasks with varying computational resources? Our main
|
||||
contribution is Matryoshka Representation Learning (MRL) which encodes
|
||||
information at different granularities and allows a single embedding to adapt
|
||||
to the computational constraints of downstream tasks. MRL minimally modifies
|
||||
existing representation learning pipelines and imposes no additional cost
|
||||
during inference and deployment. MRL learns coarse-to-fine representations that
|
||||
are at least as accurate and rich as independently trained low-dimensional
|
||||
representations. The flexibility within the learned Matryoshka Representations
|
||||
offer: (a) up to 14x smaller embedding size for ImageNet-1K classification at
|
||||
the same level of accuracy; (b) up to 14x real-world speed-ups for large-scale
|
||||
retrieval on ImageNet-1K and 4K; and (c) up to 2% accuracy improvements for
|
||||
long-tail few-shot classification, all while being as robust as the original
|
||||
representations. Finally, we show that MRL extends seamlessly to web-scale
|
||||
datasets (ImageNet, JFT) across various modalities -- vision (ViT, ResNet),
|
||||
vision + language (ALIGN) and language (BERT). MRL code and pretrained models
|
||||
are open-sourced at https://github.com/RAIVNLab/MRL.
|
||||
|
||||
## Bitext Mining Using Distilled Sentence Representations for Low-Resource Languages
|
||||
|
||||
- **arXiv id:** 2205.12654v1
|
||||
- **arXiv id:** [2205.12654v1](http://arxiv.org/abs/2205.12654v1) **Published Date:** 2022-05-25
|
||||
- **Title:** Bitext Mining Using Distilled Sentence Representations for Low-Resource Languages
|
||||
- **Authors:** Kevin Heffernan, Onur Çelebi, Holger Schwenk
|
||||
- **Published Date:** 2022-05-25
|
||||
- **URL:** http://arxiv.org/abs/2205.12654v1
|
||||
- **LangChain:**
|
||||
|
||||
- **API Reference:** [langchain_community...LaserEmbeddings](https://api.python.langchain.com/en/latest/embeddings/langchain_community.embeddings.laser.LaserEmbeddings.html#langchain_community.embeddings.laser.LaserEmbeddings)
|
||||
@@ -778,14 +765,12 @@ encoders, mine bitexts, and validate the bitexts by training NMT systems.
|
||||
|
||||
## Evaluating the Text-to-SQL Capabilities of Large Language Models
|
||||
|
||||
- **arXiv id:** 2204.00498v1
|
||||
- **arXiv id:** [2204.00498v1](http://arxiv.org/abs/2204.00498v1) **Published Date:** 2022-03-15
|
||||
- **Title:** Evaluating the Text-to-SQL Capabilities of Large Language Models
|
||||
- **Authors:** Nitarshan Rajkumar, Raymond Li, Dzmitry Bahdanau
|
||||
- **Published Date:** 2022-03-15
|
||||
- **URL:** http://arxiv.org/abs/2204.00498v1
|
||||
- **LangChain:**
|
||||
|
||||
- **API Reference:** [langchain_community...SparkSQL](https://api.python.langchain.com/en/latest/utilities/langchain_community.utilities.spark_sql.SparkSQL.html#langchain_community.utilities.spark_sql.SparkSQL), [langchain_community...SQLDatabase](https://api.python.langchain.com/en/latest/utilities/langchain_community.utilities.sql_database.SQLDatabase.html#langchain_community.utilities.sql_database.SQLDatabase)
|
||||
- **API Reference:** [langchain_community...SQLDatabase](https://api.python.langchain.com/en/latest/utilities/langchain_community.utilities.sql_database.SQLDatabase.html#langchain_community.utilities.sql_database.SQLDatabase), [langchain_community...SparkSQL](https://api.python.langchain.com/en/latest/utilities/langchain_community.utilities.spark_sql.SparkSQL.html#langchain_community.utilities.spark_sql.SparkSQL)
|
||||
|
||||
**Abstract:** We perform an empirical evaluation of Text-to-SQL capabilities of the Codex
|
||||
language model. We find that, without any finetuning, Codex is a strong
|
||||
@@ -797,14 +782,12 @@ few-shot examples.
|
||||
|
||||
## Locally Typical Sampling
|
||||
|
||||
- **arXiv id:** 2202.00666v5
|
||||
- **arXiv id:** [2202.00666v5](http://arxiv.org/abs/2202.00666v5) **Published Date:** 2022-02-01
|
||||
- **Title:** Locally Typical Sampling
|
||||
- **Authors:** Clara Meister, Tiago Pimentel, Gian Wiher, et al.
|
||||
- **Published Date:** 2022-02-01
|
||||
- **URL:** http://arxiv.org/abs/2202.00666v5
|
||||
- **LangChain:**
|
||||
|
||||
- **API Reference:** [langchain_community...HuggingFaceEndpoint](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint.html#langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint), [langchain_huggingface...HuggingFaceEndpoint](https://api.python.langchain.com/en/latest/llms/langchain_huggingface.llms.huggingface_endpoint.HuggingFaceEndpoint.html#langchain_huggingface.llms.huggingface_endpoint.HuggingFaceEndpoint), [langchain_community...HuggingFaceTextGenInference](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference.html#langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference)
|
||||
- **API Reference:** [langchain_huggingface...HuggingFaceEndpoint](https://api.python.langchain.com/en/latest/llms/langchain_huggingface.llms.huggingface_endpoint.HuggingFaceEndpoint.html#langchain_huggingface.llms.huggingface_endpoint.HuggingFaceEndpoint), [langchain_community...HuggingFaceEndpoint](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint.html#langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint), [langchain_community...HuggingFaceTextGenInference](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference.html#langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference)
|
||||
|
||||
**Abstract:** Today's probabilistic language generators fall short when it comes to
|
||||
producing coherent and fluent text despite the fact that the underlying models
|
||||
@@ -829,11 +812,9 @@ reducing degenerate repetitions.
|
||||
|
||||
## Learning Transferable Visual Models From Natural Language Supervision
|
||||
|
||||
- **arXiv id:** 2103.00020v1
|
||||
- **arXiv id:** [2103.00020v1](http://arxiv.org/abs/2103.00020v1) **Published Date:** 2021-02-26
|
||||
- **Title:** Learning Transferable Visual Models From Natural Language Supervision
|
||||
- **Authors:** Alec Radford, Jong Wook Kim, Chris Hallacy, et al.
|
||||
- **Published Date:** 2021-02-26
|
||||
- **URL:** http://arxiv.org/abs/2103.00020v1
|
||||
- **LangChain:**
|
||||
|
||||
- **API Reference:** [langchain_experimental.open_clip](https://api.python.langchain.com/en/latest/experimental_api_reference.html#module-langchain_experimental.open_clip)
|
||||
@@ -861,14 +842,12 @@ https://github.com/OpenAI/CLIP.
|
||||
|
||||
## CTRL: A Conditional Transformer Language Model for Controllable Generation
|
||||
|
||||
- **arXiv id:** 1909.05858v2
|
||||
- **arXiv id:** [1909.05858v2](http://arxiv.org/abs/1909.05858v2) **Published Date:** 2019-09-11
|
||||
- **Title:** CTRL: A Conditional Transformer Language Model for Controllable Generation
|
||||
- **Authors:** Nitish Shirish Keskar, Bryan McCann, Lav R. Varshney, et al.
|
||||
- **Published Date:** 2019-09-11
|
||||
- **URL:** http://arxiv.org/abs/1909.05858v2
|
||||
- **LangChain:**
|
||||
|
||||
- **API Reference:** [langchain_community...HuggingFaceEndpoint](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint.html#langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint), [langchain_huggingface...HuggingFaceEndpoint](https://api.python.langchain.com/en/latest/llms/langchain_huggingface.llms.huggingface_endpoint.HuggingFaceEndpoint.html#langchain_huggingface.llms.huggingface_endpoint.HuggingFaceEndpoint), [langchain_community...HuggingFaceTextGenInference](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference.html#langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference)
|
||||
- **API Reference:** [langchain_huggingface...HuggingFaceEndpoint](https://api.python.langchain.com/en/latest/llms/langchain_huggingface.llms.huggingface_endpoint.HuggingFaceEndpoint.html#langchain_huggingface.llms.huggingface_endpoint.HuggingFaceEndpoint), [langchain_community...HuggingFaceEndpoint](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint.html#langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint), [langchain_community...HuggingFaceTextGenInference](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference.html#langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference)
|
||||
|
||||
**Abstract:** Large-scale language models show promising text generation capabilities, but
|
||||
users cannot easily control particular aspects of the generated text. We
|
||||
@@ -881,32 +860,4 @@ codes also allow CTRL to predict which parts of the training data are most
|
||||
likely given a sequence. This provides a potential method for analyzing large
|
||||
amounts of data via model-based source attribution. We have released multiple
|
||||
full-sized, pretrained versions of CTRL at https://github.com/salesforce/ctrl.
|
||||
|
||||
## Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks
|
||||
|
||||
- **arXiv id:** 1908.10084v1
|
||||
- **Title:** Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks
|
||||
- **Authors:** Nils Reimers, Iryna Gurevych
|
||||
- **Published Date:** 2019-08-27
|
||||
- **URL:** http://arxiv.org/abs/1908.10084v1
|
||||
- **LangChain:**
|
||||
|
||||
- **Documentation:** [docs/integrations/text_embedding/sentence_transformers](https://python.langchain.com/docs/integrations/text_embedding/sentence_transformers)
|
||||
|
||||
**Abstract:** BERT (Devlin et al., 2018) and RoBERTa (Liu et al., 2019) has set a new
|
||||
state-of-the-art performance on sentence-pair regression tasks like semantic
|
||||
textual similarity (STS). However, it requires that both sentences are fed into
|
||||
the network, which causes a massive computational overhead: Finding the most
|
||||
similar pair in a collection of 10,000 sentences requires about 50 million
|
||||
inference computations (~65 hours) with BERT. The construction of BERT makes it
|
||||
unsuitable for semantic similarity search as well as for unsupervised tasks
|
||||
like clustering.
|
||||
In this publication, we present Sentence-BERT (SBERT), a modification of the
|
||||
pretrained BERT network that use siamese and triplet network structures to
|
||||
derive semantically meaningful sentence embeddings that can be compared using
|
||||
cosine-similarity. This reduces the effort for finding the most similar pair
|
||||
from 65 hours with BERT / RoBERTa to about 5 seconds with SBERT, while
|
||||
maintaining the accuracy from BERT.
|
||||
We evaluate SBERT and SRoBERTa on common STS tasks and transfer learning
|
||||
tasks, where it outperforms other state-of-the-art sentence embeddings methods.
|
||||
|
||||
@@ -182,7 +182,7 @@ pprint(data)
|
||||
</CodeOutputBlock>
|
||||
|
||||
|
||||
Another option is set `jq_schema='.'` and provide `content_key`:
|
||||
Another option is to set `jq_schema='.'` and provide `content_key`:
|
||||
|
||||
```python
|
||||
loader = JSONLoader(
|
||||
|
||||
@@ -26,7 +26,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install \"unstructured[md]\""
|
||||
"%pip install \"unstructured[md]\" nltk"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -315,6 +315,15 @@ For a high-level tutorial, check out [this guide](/docs/tutorials/graph/).
|
||||
- [How to: improve results with prompting](/docs/how_to/graph_prompting)
|
||||
- [How to: construct knowledge graphs](/docs/how_to/graph_constructing)
|
||||
|
||||
### Summarization
|
||||
|
||||
LLMs can summarize and otherwise distill desired information from text, including
|
||||
large volumes of text. For a high-level tutorial, check out [this guide](/docs/tutorials/summarization).
|
||||
|
||||
- [How to: summarize text in a single LLM call](/docs/how_to/summarize_stuff)
|
||||
- [How to: summarize text through parallelization](/docs/how_to/summarize_map_reduce)
|
||||
- [How to: summarize text through iterative refinement](/docs/how_to/summarize_refine)
|
||||
|
||||
## [LangGraph](https://langchain-ai.github.io/langgraph)
|
||||
|
||||
LangGraph is an extension of LangChain aimed at
|
||||
|
||||
@@ -42,7 +42,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%capture --no-stderr\n",
|
||||
"%pip install --upgrade --quiet langchain langchain-community langchain-chroma bs4"
|
||||
"%pip install --upgrade --quiet langchain langchain-community langchain-chroma beautifulsoup4"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -40,7 +40,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install --upgrade --quiet langchain langchain-community langchainhub langchain-openai langchain-chroma bs4"
|
||||
"%pip install --upgrade --quiet langchain langchain-community langchainhub langchain-openai langchain-chroma beautifulsoup4"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -33,7 +33,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install --upgrade --quiet langchain langchain-community langchainhub langchain-openai langchain-chroma bs4"
|
||||
"%pip install --upgrade --quiet langchain langchain-community langchainhub langchain-openai langchain-chroma beautifulsoup4"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
449
docs/docs/how_to/summarize_map_reduce.ipynb
Normal file
449
docs/docs/how_to/summarize_map_reduce.ipynb
Normal file
File diff suppressed because one or more lines are too long
333
docs/docs/how_to/summarize_refine.ipynb
Normal file
333
docs/docs/how_to/summarize_refine.ipynb
Normal file
File diff suppressed because one or more lines are too long
209
docs/docs/how_to/summarize_stuff.ipynb
Normal file
209
docs/docs/how_to/summarize_stuff.ipynb
Normal file
@@ -0,0 +1,209 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c47f5b2f-e14c-43e7-a0ab-d71562636624",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"sidebar_position: 3\n",
|
||||
"keywords: [summarize, summarization, stuff, create_stuff_documents_chain]\n",
|
||||
"---"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "682a4f53-27db-43ef-a909-dd9ded76051b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# How to summarize text in a single LLM call\n",
|
||||
"\n",
|
||||
"LLMs can summarize and otherwise distill desired information from text, including large volumes of text. In many cases, especially for models with larger context windows, this can be adequately achieved via a single LLM call.\n",
|
||||
"\n",
|
||||
"LangChain implements a simple [pre-built chain](https://api.python.langchain.com/en/latest/chains/langchain.chains.combine_documents.stuff.create_stuff_documents_chain.html) that \"stuffs\" a prompt with the desired context for summarization and other purposes. In this guide we demonstrate how to use the chain."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4aa52e84-d1b5-4b33-b4c4-541156686ef3",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load chat model\n",
|
||||
"\n",
|
||||
"Let's first load a chat model:\n",
|
||||
"```{=mdx}\n",
|
||||
"import ChatModelTabs from \"@theme/ChatModelTabs\";\n",
|
||||
"\n",
|
||||
"<ChatModelTabs\n",
|
||||
" customVarName=\"llm\"\n",
|
||||
"/>\n",
|
||||
"```"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "e5f426fc-cea6-4351-8931-1e422d3c8b69",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# | output: false\n",
|
||||
"# | echo: false\n",
|
||||
"\n",
|
||||
"from langchain_openai import ChatOpenAI\n",
|
||||
"\n",
|
||||
"llm = ChatOpenAI(model=\"gpt-4o-mini\", temperature=0)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b137fe82-0a53-4910-b53e-b87a297f329d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load documents"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a81dc91d-ae72-4996-b809-d4a9050e815e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Next, we need some documents to summarize. Below, we generate some toy documents for illustrative purposes. See the document loader [how-to guides](/docs/how_to/#document-loaders) and [integration pages](/docs/integrations/document_loaders/) for additional sources of data. The [summarization tutorial](/docs/tutorials/summarization) also includes an example summarizing a blog post."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "27c8fed0-b2d7-4549-a086-f5ee657efc41",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_core.documents import Document\n",
|
||||
"\n",
|
||||
"documents = [\n",
|
||||
" Document(page_content=\"Apples are red\", metadata={\"title\": \"apple_book\"}),\n",
|
||||
" Document(page_content=\"Blueberries are blue\", metadata={\"title\": \"blueberry_book\"}),\n",
|
||||
" Document(page_content=\"Bananas are yelow\", metadata={\"title\": \"banana_book\"}),\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "84216044-6f1e-4b90-b4fa-29ec305abf51",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load chain\n",
|
||||
"\n",
|
||||
"Below, we define a simple prompt and instantiate the chain with our chat model and documents:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "669afa40-2708-4fa1-841e-c74a67bd9175",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.chains.combine_documents import create_stuff_documents_chain\n",
|
||||
"from langchain_core.prompts import ChatPromptTemplate\n",
|
||||
"\n",
|
||||
"prompt = ChatPromptTemplate.from_template(\"Summarize this content: {context}\")\n",
|
||||
"chain = create_stuff_documents_chain(llm, prompt)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "74f3e276-f003-4112-ba14-c6952076c4f8",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Invoke chain\n",
|
||||
"\n",
|
||||
"Because the chain is a [Runnable](/docs/concepts/#runnable-interface), it implements the usual methods for invocation:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "0701bb7d-fbc6-497e-a577-25d56e6e43c6",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'The content describes the colors of three fruits: apples are red, blueberries are blue, and bananas are yellow.'"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"result = chain.invoke({\"context\": documents})\n",
|
||||
"result"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "14fb5647-1458-43af-afb7-5aae7b8cab1d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Streaming\n",
|
||||
"\n",
|
||||
"Note that the chain also supports streaming of individual output tokens:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "0d7a5f67-2ec8-4f90-b085-2969fcb14dce",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"|The| content| describes| the| colors| of| three| fruits|:| apples| are| red|,| blueberries| are| blue|,| and| bananas| are| yellow|.||"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for chunk in chain.stream({\"context\": documents}):\n",
|
||||
" print(chunk, end=\"|\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f15c225a-db1d-48cf-b135-f588e7d615e6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Next steps\n",
|
||||
"\n",
|
||||
"See the summarization [how-to guides](/docs/how_to/#summarization) for additional summarization strategies, including those designed for larger volumes of text.\n",
|
||||
"\n",
|
||||
"See also [this tutorial](/docs/tutorials/summarization) for more detail on summarization."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -37,7 +37,8 @@
|
||||
"%pip install --upgrade --quiet infinopy\n",
|
||||
"%pip install --upgrade --quiet matplotlib\n",
|
||||
"%pip install --upgrade --quiet tiktoken\n",
|
||||
"%pip install --upgrade --quiet langchain langchain-openai langchain-community"
|
||||
"%pip install --upgrade --quiet langchain langchain-openai langchain-community\n",
|
||||
"%pip install --upgrade --quiet beautifulsoup4"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -110,7 +110,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 9,
|
||||
"id": "cb09c344-1836-4e0c-acf8-11d13ac1dbae",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -118,7 +118,7 @@
|
||||
"from langchain_ollama import ChatOllama\n",
|
||||
"\n",
|
||||
"llm = ChatOllama(\n",
|
||||
" model=\"llama3\",\n",
|
||||
" model=\"llama3.1\",\n",
|
||||
" temperature=0,\n",
|
||||
" # other params...\n",
|
||||
")"
|
||||
@@ -134,7 +134,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 10,
|
||||
"id": "62e0dbc3",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
@@ -143,10 +143,10 @@
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"AIMessage(content='Je adore le programmation.\\n\\n(Note: \"programmation\" is not commonly used in French, but I translated it as \"le programmation\" to maintain the same grammatical structure and meaning as the original English sentence.)', response_metadata={'model': 'llama3', 'created_at': '2024-07-22T17:43:54.731273Z', 'message': {'role': 'assistant', 'content': ''}, 'done_reason': 'stop', 'done': True, 'total_duration': 11094839375, 'load_duration': 10121854667, 'prompt_eval_count': 36, 'prompt_eval_duration': 146569000, 'eval_count': 46, 'eval_duration': 816593000}, id='run-befccbdc-e1f9-42a9-85cf-e69b926d6b8b-0', usage_metadata={'input_tokens': 36, 'output_tokens': 46, 'total_tokens': 82})"
|
||||
"AIMessage(content='The translation of \"I love programming\" from English to French is:\\n\\n\"J\\'adore programmer.\"', response_metadata={'model': 'llama3.1', 'created_at': '2024-08-19T16:05:32.81965Z', 'message': {'role': 'assistant', 'content': ''}, 'done_reason': 'stop', 'done': True, 'total_duration': 2167842917, 'load_duration': 54222584, 'prompt_eval_count': 35, 'prompt_eval_duration': 893007000, 'eval_count': 22, 'eval_duration': 1218962000}, id='run-0863daa2-43bf-4a43-86cc-611b23eae466-0', usage_metadata={'input_tokens': 35, 'output_tokens': 22, 'total_tokens': 57})"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@@ -167,7 +167,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 11,
|
||||
"id": "d86145b3-bfef-46e8-b227-4dda5c9c2705",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -175,9 +175,9 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Je adore le programmation.\n",
|
||||
"The translation of \"I love programming\" from English to French is:\n",
|
||||
"\n",
|
||||
"(Note: \"programmation\" is not commonly used in French, but I translated it as \"le programmation\" to maintain the same grammatical structure and meaning as the original English sentence.)\n"
|
||||
"\"J'adore programmer.\"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -197,17 +197,17 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 12,
|
||||
"id": "e197d1d7-a070-4c96-9f8a-a0e86d046e0b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"AIMessage(content='Ich liebe Programmieren!\\n\\n(Note: \"Ich liebe\" means \"I love\", \"Programmieren\" is the verb for \"programming\")', response_metadata={'model': 'llama3', 'created_at': '2024-07-04T04:22:33.864132Z', 'message': {'role': 'assistant', 'content': ''}, 'done_reason': 'stop', 'done': True, 'total_duration': 1310800083, 'load_duration': 1782000, 'prompt_eval_count': 16, 'prompt_eval_duration': 250199000, 'eval_count': 29, 'eval_duration': 1057192000}, id='run-cbadbe59-2de2-4ec0-a18a-b3220226c3d2-0')"
|
||||
"AIMessage(content='Das Programmieren ist mir ein Leidenschaft! (That\\'s \"Programming is my passion!\" in German.) Would you like me to translate anything else?', response_metadata={'model': 'llama3.1', 'created_at': '2024-08-19T16:05:34.893548Z', 'message': {'role': 'assistant', 'content': ''}, 'done_reason': 'stop', 'done': True, 'total_duration': 2045997333, 'load_duration': 22584792, 'prompt_eval_count': 30, 'prompt_eval_duration': 213210000, 'eval_count': 32, 'eval_duration': 1808541000}, id='run-d18e1c6b-50e0-4b1d-b23a-973fa058edad-0', usage_metadata={'input_tokens': 30, 'output_tokens': 32, 'total_tokens': 62})"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@@ -242,33 +242,32 @@
|
||||
"source": [
|
||||
"## Tool calling\n",
|
||||
"\n",
|
||||
"We can use [tool calling](https://blog.langchain.dev/improving-core-tool-interfaces-and-docs-in-langchain/) with an LLM [that has been fine-tuned for tool use](https://ollama.com/library/llama3-groq-tool-use): \n",
|
||||
"We can use [tool calling](https://blog.langchain.dev/improving-core-tool-interfaces-and-docs-in-langchain/) with an LLM [that has been fine-tuned for tool use](https://ollama.com/library/llama3.1): \n",
|
||||
"\n",
|
||||
"```\n",
|
||||
"ollama pull llama3-groq-tool-use\n",
|
||||
"ollama pull llama3.1\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"We can just pass normal Python functions directly as tools."
|
||||
"Details on creating custom tools are available in [this guide](/docs/how_to/custom_tools/). Below, we demonstrate how to create a tool using the `@tool` decorator on a normal python function."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "5250bceb-1029-41ff-b447-983518704d88",
|
||||
"execution_count": 13,
|
||||
"id": "f767015f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[{'name': 'validate_user',\n",
|
||||
" 'args': {'addresses': ['123 Fake St, Boston MA',\n",
|
||||
" '234 Pretend Boulevard, Houston TX'],\n",
|
||||
" 'user_id': 123},\n",
|
||||
" 'id': 'fe2148d3-95fb-48e9-845a-4bfecc1f1f96',\n",
|
||||
" 'args': {'addresses': '[\"123 Fake St, Boston, MA\", \"234 Pretend Boulevard, Houston, TX\"]',\n",
|
||||
" 'user_id': '123'},\n",
|
||||
" 'id': '40fe3de0-500c-4b91-9616-5932a929e640',\n",
|
||||
" 'type': 'tool_call'}]"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@@ -276,22 +275,23 @@
|
||||
"source": [
|
||||
"from typing import List\n",
|
||||
"\n",
|
||||
"from langchain_core.tools import tool\n",
|
||||
"from langchain_ollama import ChatOllama\n",
|
||||
"from typing_extensions import TypedDict\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def validate_user(user_id: int, addresses: List) -> bool:\n",
|
||||
"@tool\n",
|
||||
"def validate_user(user_id: int, addresses: List[str]) -> bool:\n",
|
||||
" \"\"\"Validate user using historical addresses.\n",
|
||||
"\n",
|
||||
" Args:\n",
|
||||
" user_id: (int) the user ID.\n",
|
||||
" addresses: Previous addresses.\n",
|
||||
" user_id (int): the user ID.\n",
|
||||
" addresses (List[str]): Previous addresses as a list of strings.\n",
|
||||
" \"\"\"\n",
|
||||
" return True\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"llm = ChatOllama(\n",
|
||||
" model=\"llama3-groq-tool-use\",\n",
|
||||
" model=\"llama3.1\",\n",
|
||||
" temperature=0,\n",
|
||||
").bind_tools([validate_user])\n",
|
||||
"\n",
|
||||
@@ -303,18 +303,6 @@
|
||||
"result.tool_calls"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2bb034ff-218f-4865-afea-3f5e57d3bdee",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We look at the LangSmith trace to see that the tool call was performed: \n",
|
||||
"\n",
|
||||
"https://smith.langchain.com/public/4169348a-d6be-45df-a7cf-032f6baa4697/r\n",
|
||||
"\n",
|
||||
"In particular, the trace shows how the tool schema was populated."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4c5e0197",
|
||||
@@ -331,7 +319,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": 15,
|
||||
"id": "36c9b1c2",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -391,7 +379,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": 16,
|
||||
"id": "32b3ba7b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -467,7 +455,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.8"
|
||||
"version": "3.12.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -1,322 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "raw",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"sidebar_label: Ollama Functions\n",
|
||||
"sidebar_class_name: hidden\n",
|
||||
"---"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# OllamaFunctions\n",
|
||||
"\n",
|
||||
":::warning\n",
|
||||
"\n",
|
||||
"This was an experimental wrapper that attempts to bolt-on tool calling support to models that do not natively support it. The [primary Ollama integration](/docs/integrations/chat/ollama/) now supports tool calling, and should be used instead.\n",
|
||||
"\n",
|
||||
":::\n",
|
||||
"This notebook shows how to use an experimental wrapper around Ollama that gives it [tool calling capabilities](https://python.langchain.com/v0.2/docs/concepts/#functiontool-calling).\n",
|
||||
"\n",
|
||||
"Note that more powerful and capable models will perform better with complex schema and/or multiple functions. The examples below use llama3 and phi3 models.\n",
|
||||
"For a complete list of supported models and model variants, see the [Ollama model library](https://ollama.ai/library).\n",
|
||||
"\n",
|
||||
"## Overview\n",
|
||||
"\n",
|
||||
"### Integration details\n",
|
||||
"\n",
|
||||
"| Class | Package | Local | Serializable | JS support | Package downloads | Package latest |\n",
|
||||
"|:-----------------------------------------------------------------------------------------------------------------------------------:|:-------:|:-----:|:------------:|:----------:|:-----------------:|:--------------:|\n",
|
||||
"| [OllamaFunctions](https://api.python.langchain.com/en/latest/llms/langchain_experimental.llms.ollama_function.OllamaFunctions.html) | [langchain-experimental](https://api.python.langchain.com/en/latest/openai_api_reference.html) | ✅ | ❌ | ❌ |  |  |\n",
|
||||
"\n",
|
||||
"### Model features\n",
|
||||
"\n",
|
||||
"| [Tool calling](/docs/how_to/tool_calling/) | [Structured output](/docs/how_to/structured_output/) | JSON mode | Image input | Audio input | Video input | [Token-level streaming](/docs/how_to/chat_streaming/) | Native async | [Token usage](/docs/how_to/chat_token_usage_tracking/) | [Logprobs](/docs/how_to/logprobs/) |\n",
|
||||
"| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |\n",
|
||||
"| ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |\n",
|
||||
"\n",
|
||||
"## Setup\n",
|
||||
"\n",
|
||||
"To access `OllamaFunctions` you will need to install `langchain-experimental` integration package.\n",
|
||||
"Follow [these instructions](https://github.com/jmorganca/ollama) to set up and run a local Ollama instance as well as download and serve [supported models](https://ollama.com/library).\n",
|
||||
"\n",
|
||||
"### Credentials\n",
|
||||
"\n",
|
||||
"Credentials support is not present at this time.\n",
|
||||
"\n",
|
||||
"### Installation\n",
|
||||
"\n",
|
||||
"The `OllamaFunctions` class lives in the `langchain-experimental` package:\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -qU langchain-experimental"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Instantiation\n",
|
||||
"\n",
|
||||
"`OllamaFunctions` takes the same init parameters as `ChatOllama`. \n",
|
||||
"\n",
|
||||
"In order to use tool calling, you must also specify `format=\"json\"`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-06-23T15:20:21.818089Z",
|
||||
"start_time": "2024-06-23T15:20:21.815759Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_experimental.llms.ollama_functions import OllamaFunctions\n",
|
||||
"\n",
|
||||
"llm = OllamaFunctions(model=\"phi3\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Invocation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-06-23T15:20:46.794689Z",
|
||||
"start_time": "2024-06-23T15:20:44.982632Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"AIMessage(content=\"J'adore programmer.\", id='run-94815fcf-ae11-438a-ba3f-00819328b5cd-0')"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"messages = [\n",
|
||||
" (\n",
|
||||
" \"system\",\n",
|
||||
" \"You are a helpful assistant that translates English to French. Translate the user sentence.\",\n",
|
||||
" ),\n",
|
||||
" (\"human\", \"I love programming.\"),\n",
|
||||
"]\n",
|
||||
"ai_msg = llm.invoke(messages)\n",
|
||||
"ai_msg"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"\"J'adore programmer.\""
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"ai_msg.content"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Chaining\n",
|
||||
"\n",
|
||||
"We can [chain](https://python.langchain.com/v0.2/docs/how_to/sequence/) our model with a prompt template like so:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"AIMessage(content='Programmieren ist sehr verrückt! Es freut mich, dass Sie auf Programmierung so positiv eingestellt sind.', id='run-ee99be5e-4d48-4ab6-b602-35415f0bdbde-0')"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain_core.prompts import ChatPromptTemplate\n",
|
||||
"\n",
|
||||
"prompt = ChatPromptTemplate.from_messages(\n",
|
||||
" [\n",
|
||||
" (\n",
|
||||
" \"system\",\n",
|
||||
" \"You are a helpful assistant that translates {input_language} to {output_language}.\",\n",
|
||||
" ),\n",
|
||||
" (\"human\", \"{input}\"),\n",
|
||||
" ]\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"chain = prompt | llm\n",
|
||||
"chain.invoke(\n",
|
||||
" {\n",
|
||||
" \"input_language\": \"English\",\n",
|
||||
" \"output_language\": \"German\",\n",
|
||||
" \"input\": \"I love programming.\",\n",
|
||||
" }\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Tool Calling\n",
|
||||
"\n",
|
||||
"### OllamaFunctions.bind_tools()\n",
|
||||
"\n",
|
||||
"With `OllamaFunctions.bind_tools`, we can easily pass in Pydantic classes, dict schemas, LangChain tools, or even functions as tools to the model. Under the hood these are converted to a tool definition schemas, which looks like:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_core.pydantic_v1 import BaseModel, Field\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class GetWeather(BaseModel):\n",
|
||||
" \"\"\"Get the current weather in a given location\"\"\"\n",
|
||||
"\n",
|
||||
" location: str = Field(..., description=\"The city and state, e.g. San Francisco, CA\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"llm_with_tools = llm.bind_tools([GetWeather])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"AIMessage(content='', id='run-b9769435-ec6a-4cb8-8545-5a5035fc19bd-0', tool_calls=[{'name': 'GetWeather', 'args': {'location': 'San Francisco, CA'}, 'id': 'call_064c4e1cb27e4adb9e4e7ed60362ecc9'}])"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"ai_msg = llm_with_tools.invoke(\n",
|
||||
" \"what is the weather like in San Francisco\",\n",
|
||||
")\n",
|
||||
"ai_msg"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### AIMessage.tool_calls\n",
|
||||
"\n",
|
||||
"Notice that the AIMessage has a `tool_calls` attribute. This contains in a standardized `ToolCall` format that is model-provider agnostic."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[{'name': 'GetWeather',\n",
|
||||
" 'args': {'location': 'San Francisco, CA'},\n",
|
||||
" 'id': 'call_064c4e1cb27e4adb9e4e7ed60362ecc9'}]"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"ai_msg.tool_calls"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"For more on binding tools and tool call outputs, head to the [tool calling](../../how_to/function_calling.ipynb) docs."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## API reference\n",
|
||||
"\n",
|
||||
"For detailed documentation of all ToolCallingLLM features and configurations head to the API reference: https://api.python.langchain.com/en/latest/llms/langchain_experimental.llms.ollama_functions.OllamaFunctions.html\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
@@ -151,7 +151,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 3,
|
||||
"id": "ce16ad78-8e6f-48cd-954e-98be75eb5836",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
@@ -160,10 +160,10 @@
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"AIMessage(content=\"J'adore la programmation.\", response_metadata={'token_usage': {'completion_tokens': 5, 'prompt_tokens': 31, 'total_tokens': 36}, 'model_name': 'gpt-4o', 'system_fingerprint': 'fp_43dfabdef1', 'finish_reason': 'stop', 'logprobs': None}, id='run-012cffe2-5d3d-424d-83b5-51c6d4a593d1-0', usage_metadata={'input_tokens': 31, 'output_tokens': 5, 'total_tokens': 36})"
|
||||
"AIMessage(content=\"J'adore la programmation.\", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 5, 'prompt_tokens': 31, 'total_tokens': 36}, 'model_name': 'gpt-4o-2024-05-13', 'system_fingerprint': 'fp_3aa7262c27', 'finish_reason': 'stop', 'logprobs': None}, id='run-63219b22-03e3-4561-8cc4-78b7c7c3a3ca-0', usage_metadata={'input_tokens': 31, 'output_tokens': 5, 'total_tokens': 36})"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@@ -182,7 +182,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 4,
|
||||
"id": "2cd224b8-4499-41fb-a604-d53a7ff17b2e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -210,7 +210,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 5,
|
||||
"id": "fbb043e6",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
@@ -219,10 +219,10 @@
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"AIMessage(content='Ich liebe Programmieren.', response_metadata={'token_usage': {'completion_tokens': 5, 'prompt_tokens': 26, 'total_tokens': 31}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': 'fp_b28b39ffa8', 'finish_reason': 'stop', 'logprobs': None}, id='run-94fa6741-c99b-4513-afce-c3f562631c79-0')"
|
||||
"AIMessage(content='Ich liebe das Programmieren.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 6, 'prompt_tokens': 26, 'total_tokens': 32}, 'model_name': 'gpt-4o-2024-05-13', 'system_fingerprint': 'fp_3aa7262c27', 'finish_reason': 'stop', 'logprobs': None}, id='run-350585e1-16ca-4dad-9460-3d9e7e49aaf1-0', usage_metadata={'input_tokens': 26, 'output_tokens': 6, 'total_tokens': 32})"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@@ -274,7 +274,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 6,
|
||||
"id": "b7ea7690-ec7a-4337-b392-e87d1f39a6ec",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -293,17 +293,17 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 7,
|
||||
"id": "1d1ab955-6a68-42f8-bb5d-86eb1111478a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_H7fABDuzEau48T10Qn0Lsh0D', 'function': {'arguments': '{\"location\":\"San Francisco\"}', 'name': 'GetWeather'}, 'type': 'function'}]}, response_metadata={'token_usage': {'completion_tokens': 15, 'prompt_tokens': 70, 'total_tokens': 85}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': 'fp_b28b39ffa8', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run-b469135e-2718-446a-8164-eef37e672ba2-0', tool_calls=[{'name': 'GetWeather', 'args': {'location': 'San Francisco'}, 'id': 'call_H7fABDuzEau48T10Qn0Lsh0D'}])"
|
||||
"AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_o9udf3EVOWiV4Iupktpbpofk', 'function': {'arguments': '{\"location\":\"San Francisco, CA\"}', 'name': 'GetWeather'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 17, 'prompt_tokens': 68, 'total_tokens': 85}, 'model_name': 'gpt-4o-2024-05-13', 'system_fingerprint': 'fp_3aa7262c27', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run-1617c9b2-dda5-4120-996b-0333ed5992e2-0', tool_calls=[{'name': 'GetWeather', 'args': {'location': 'San Francisco, CA'}, 'id': 'call_o9udf3EVOWiV4Iupktpbpofk', 'type': 'tool_call'}], usage_metadata={'input_tokens': 68, 'output_tokens': 17, 'total_tokens': 85})"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@@ -333,17 +333,17 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 8,
|
||||
"id": "dc8ac4f1-4039-4392-90c1-2d8331cd6910",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_VYEfpPDh3npMQ95J9EWmWvSn', 'function': {'arguments': '{\"location\":\"San Francisco, CA\"}', 'name': 'GetWeather'}, 'type': 'function'}]}, response_metadata={'token_usage': {'completion_tokens': 17, 'prompt_tokens': 68, 'total_tokens': 85}, 'model_name': 'gpt-4o-2024-05-13', 'system_fingerprint': 'fp_3aa7262c27', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run-a4c6749b-adbb-45c7-8b17-8d6835d5c443-0', tool_calls=[{'name': 'GetWeather', 'args': {'location': 'San Francisco, CA'}, 'id': 'call_VYEfpPDh3npMQ95J9EWmWvSn', 'type': 'tool_call'}], usage_metadata={'input_tokens': 68, 'output_tokens': 17, 'total_tokens': 85})"
|
||||
"AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_jUqhd8wzAIzInTJl72Rla8ht', 'function': {'arguments': '{\"location\":\"San Francisco, CA\"}', 'name': 'GetWeather'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 17, 'prompt_tokens': 68, 'total_tokens': 85}, 'model_name': 'gpt-4o-2024-05-13', 'system_fingerprint': 'fp_3aa7262c27', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run-5e3356a9-132d-4623-8e73-dd5a898cf4a6-0', tool_calls=[{'name': 'GetWeather', 'args': {'location': 'San Francisco, CA'}, 'id': 'call_jUqhd8wzAIzInTJl72Rla8ht', 'type': 'tool_call'}], usage_metadata={'input_tokens': 68, 'output_tokens': 17, 'total_tokens': 85})"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@@ -367,7 +367,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": 9,
|
||||
"id": "166cb7ce-831d-4a7c-9721-abc107f11084",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -375,11 +375,12 @@
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[{'name': 'GetWeather',\n",
|
||||
" 'args': {'location': 'San Francisco'},\n",
|
||||
" 'id': 'call_H7fABDuzEau48T10Qn0Lsh0D'}]"
|
||||
" 'args': {'location': 'San Francisco, CA'},\n",
|
||||
" 'id': 'call_jUqhd8wzAIzInTJl72Rla8ht',\n",
|
||||
" 'type': 'tool_call'}]"
|
||||
]
|
||||
},
|
||||
"execution_count": 11,
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@@ -410,17 +411,17 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 11,
|
||||
"id": "33c4a8b0",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"AIMessage(content=\"J'adore la programmation.\", additional_kwargs={}, example=False)"
|
||||
"AIMessage(content=\"J'adore la programmation.\", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 8, 'prompt_tokens': 31, 'total_tokens': 39}, 'model_name': 'ft:gpt-3.5-turbo-0613:langchain::7qTVM5AR', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-0f39b30e-c56e-4f3b-af99-5c948c984146-0', usage_metadata={'input_tokens': 31, 'output_tokens': 8, 'total_tokens': 39})"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@@ -430,7 +431,7 @@
|
||||
" temperature=0, model_name=\"ft:gpt-3.5-turbo-0613:langchain::7qTVM5AR\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"fine_tuned_model(messages)"
|
||||
"fine_tuned_model.invoke(messages)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -446,9 +447,9 @@
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "poetry-venv-311",
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "poetry-venv-311"
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
@@ -460,7 +461,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
"version": "3.11.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -53,7 +53,8 @@
|
||||
"import getpass\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"os.environ[\"TOGETHER_API_KEY\"] = getpass.getpass(\"Enter your Together API key: \")"
|
||||
"if \"TOGETHER_API_KEY\" not in os.environ:\n",
|
||||
" os.environ[\"TOGETHER_API_KEY\"] = getpass.getpass(\"Enter your Together API key: \")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -87,21 +88,10 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": null,
|
||||
"id": "652d6238-1f87-422a-b135-f5abbb8652fc",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.1.2\u001b[0m\n",
|
||||
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
|
||||
"Note: you may need to restart the kernel to use updated packages.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -qU langchain-together"
|
||||
]
|
||||
@@ -113,14 +103,12 @@
|
||||
"source": [
|
||||
"## Instantiation\n",
|
||||
"\n",
|
||||
"Now we can instantiate our model object and generate chat completions:\n",
|
||||
"\n",
|
||||
"- TODO: Update model instantiation with relevant params."
|
||||
"Now we can instantiate our model object and generate chat completions:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 3,
|
||||
"id": "cb09c344-1836-4e0c-acf8-11d13ac1dbae",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -147,7 +135,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 4,
|
||||
"id": "62e0dbc3",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
@@ -156,10 +144,10 @@
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"AIMessage(content=\"J'adore la programmation.\", response_metadata={'token_usage': {'completion_tokens': 9, 'prompt_tokens': 35, 'total_tokens': 44}, 'model_name': 'meta-llama/Llama-3-70b-chat-hf', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-79efa49b-dbaf-4ef8-9dce-958533823ef6-0', usage_metadata={'input_tokens': 35, 'output_tokens': 9, 'total_tokens': 44})"
|
||||
"AIMessage(content=\"J'adore la programmation.\", response_metadata={'token_usage': {'completion_tokens': 9, 'prompt_tokens': 35, 'total_tokens': 44}, 'model_name': 'meta-llama/Llama-3-70b-chat-hf', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-eabcbe33-cdd8-45b8-ab0b-f90b6e7dfad8-0', usage_metadata={'input_tokens': 35, 'output_tokens': 9, 'total_tokens': 44})"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@@ -178,7 +166,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 5,
|
||||
"id": "d86145b3-bfef-46e8-b227-4dda5c9c2705",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -206,17 +194,17 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 7,
|
||||
"id": "e197d1d7-a070-4c96-9f8a-a0e86d046e0b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"AIMessage(content='Ich liebe das Programmieren.', response_metadata={'token_usage': {'completion_tokens': 7, 'prompt_tokens': 30, 'total_tokens': 37}, 'model_name': 'meta-llama/Llama-3-70b-chat-hf', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-80bba5fa-1723-4242-8d5a-c09b76b8350b-0', usage_metadata={'input_tokens': 30, 'output_tokens': 7, 'total_tokens': 37})"
|
||||
"AIMessage(content='Ich liebe das Programmieren.', response_metadata={'token_usage': {'completion_tokens': 7, 'prompt_tokens': 30, 'total_tokens': 37}, 'model_name': 'meta-llama/Llama-3-70b-chat-hf', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-a249aa24-ee31-46ba-9bf9-f4eb135b0a95-0', usage_metadata={'input_tokens': 30, 'output_tokens': 7, 'total_tokens': 37})"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@@ -271,7 +259,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
"version": "3.11.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
243
docs/docs/integrations/document_loaders/bshtml.ipynb
Normal file
243
docs/docs/integrations/document_loaders/bshtml.ipynb
Normal file
@@ -0,0 +1,243 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# BSHTMLLoader\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"This notebook provides a quick overview for getting started with the BeautifulSoup4 [document loader](https://python.langchain.com/v0.2/docs/concepts/#document-loaders). For detailed documentation of all BSHTMLLoader features and configurations head to the [API reference](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.html_bs.BSHTMLLoader.html).\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"## Overview\n",
|
||||
"### Integration details\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"| Class | Package | Local | Serializable | JS support|\n",
|
||||
"| :--- | :--- | :---: | :---: | :---: |\n",
|
||||
"| [BSHTMLLoader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.html_bs.BSHTMLLoader.html) | [langchain_community](https://api.python.langchain.com/en/latest/community_api_reference.html) | ✅ | ❌ | ❌ | \n",
|
||||
"### Loader features\n",
|
||||
"| Source | Document Lazy Loading | Native Async Support\n",
|
||||
"| :---: | :---: | :---: | \n",
|
||||
"| BSHTMLLoader | ✅ | ❌ | \n",
|
||||
"\n",
|
||||
"## Setup\n",
|
||||
"\n",
|
||||
"To access BSHTMLLoader document loader you'll need to install the `langchain-community` integration package and the `bs4` python package.\n",
|
||||
"\n",
|
||||
"### Credentials\n",
|
||||
"\n",
|
||||
"No credentials are needed to use the `BSHTMLLoader` class."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"If you want to get automated best-in-class tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")\n",
|
||||
"# os.environ[\"LANGSMITH_TRACING\"] = \"true\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Installation\n",
|
||||
"\n",
|
||||
"Install **langchain_community** and **bs4**."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -qU langchain_community bs4"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialization\n",
|
||||
"\n",
|
||||
"Now we can instantiate our loader object and load documents:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_community.document_loaders import BSHTMLLoader\n",
|
||||
"\n",
|
||||
"loader = BSHTMLLoader(\n",
|
||||
" file_path=\"./example_data/fake-content.html\",\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(metadata={'source': './example_data/fake-content.html', 'title': 'Test Title'}, page_content='\\nTest Title\\n\\n\\nMy First Heading\\nMy first paragraph.\\n\\n\\n')"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs = loader.load()\n",
|
||||
"docs[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'source': './example_data/fake-content.html', 'title': 'Test Title'}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(docs[0].metadata)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Lazy Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(metadata={'source': './example_data/fake-content.html', 'title': 'Test Title'}, page_content='\\nTest Title\\n\\n\\nMy First Heading\\nMy first paragraph.\\n\\n\\n')"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"page = []\n",
|
||||
"for doc in loader.lazy_load():\n",
|
||||
" page.append(doc)\n",
|
||||
" if len(page) >= 10:\n",
|
||||
" # do some paged operation, e.g.\n",
|
||||
" # index.upsert(page)\n",
|
||||
"\n",
|
||||
" page = []\n",
|
||||
"page[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Adding separator to BS4\n",
|
||||
"\n",
|
||||
"We can also pass a separator to use when calling `get_text` on the soup."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"page_content='\n",
|
||||
", Test Title, \n",
|
||||
", \n",
|
||||
", \n",
|
||||
", My First Heading, \n",
|
||||
", My first paragraph., \n",
|
||||
", \n",
|
||||
", \n",
|
||||
"' metadata={'source': './example_data/fake-content.html', 'title': 'Test Title'}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"loader = BSHTMLLoader(\n",
|
||||
" file_path=\"./example_data/fake-content.html\", get_text_separator=\", \"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"docs = loader.load()\n",
|
||||
"print(docs[0])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## API reference\n",
|
||||
"\n",
|
||||
"For detailed documentation of all BSHTMLLoader features and configurations head to the API reference: https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.html_bs.BSHTMLLoader.html"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -0,0 +1,55 @@
|
||||
# Sample Markdown Document
|
||||
|
||||
## Introduction
|
||||
|
||||
Welcome to this sample Markdown document. Markdown is a lightweight markup language used for formatting text. It's widely used for documentation, readme files, and more.
|
||||
|
||||
## Features
|
||||
|
||||
### Headers
|
||||
|
||||
Markdown supports multiple levels of headers:
|
||||
|
||||
- **Header 1**: `# Header 1`
|
||||
- **Header 2**: `## Header 2`
|
||||
- **Header 3**: `### Header 3`
|
||||
|
||||
### Lists
|
||||
|
||||
#### Unordered List
|
||||
|
||||
- Item 1
|
||||
- Item 2
|
||||
- Subitem 2.1
|
||||
- Subitem 2.2
|
||||
|
||||
#### Ordered List
|
||||
|
||||
1. First item
|
||||
2. Second item
|
||||
3. Third item
|
||||
|
||||
### Links
|
||||
|
||||
[OpenAI](https://www.openai.com) is an AI research organization.
|
||||
|
||||
### Images
|
||||
|
||||
Here's an example image:
|
||||
|
||||

|
||||
|
||||
### Code
|
||||
|
||||
#### Inline Code
|
||||
|
||||
Use `code` for inline code snippets.
|
||||
|
||||
#### Code Block
|
||||
|
||||
```python
|
||||
def greet(name):
|
||||
return f"Hello, {name}!"
|
||||
|
||||
print(greet("World"))
|
||||
```
|
||||
@@ -30,6 +30,7 @@
|
||||
{
|
||||
"sender_name": "User 2",
|
||||
"timestamp_ms": 1675595060730,
|
||||
"content": "",
|
||||
"photos": [
|
||||
{"uri": "url_of_some_picture.jpg", "creation_timestamp": 1675595059}
|
||||
]
|
||||
|
||||
@@ -21,24 +21,24 @@ loader = CSVLoader(
|
||||
data = loader.load()
|
||||
```
|
||||
|
||||
## Common File Types
|
||||
|
||||
The below document loaders allow you to load data from common data formats.
|
||||
|
||||
<CategoryTable category="common_loaders" />
|
||||
|
||||
## PDFs
|
||||
|
||||
The below document loaders allow you to load documents.
|
||||
|
||||
<CategoryTable category="pdf_loaders" />
|
||||
|
||||
## Webpages
|
||||
|
||||
The below document loaders allow you to load webpages.
|
||||
|
||||
<CategoryTable category="webpage_loaders" />
|
||||
|
||||
## PDFs
|
||||
|
||||
The below document loaders allow you to load PDF documents.
|
||||
|
||||
<CategoryTable category="pdf_loaders" />
|
||||
|
||||
## Common File Types
|
||||
|
||||
The below document loaders allow you to load data from common data formats.
|
||||
|
||||
<CategoryTable category="common_loaders" />
|
||||
|
||||
|
||||
## All document loaders
|
||||
|
||||
|
||||
348
docs/docs/integrations/document_loaders/json.ipynb
Normal file
348
docs/docs/integrations/document_loaders/json.ipynb
Normal file
@@ -0,0 +1,348 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# JSONLoader\n",
|
||||
"\n",
|
||||
"This notebook provides a quick overview for getting started with JSON [document loader](https://python.langchain.com/v0.2/docs/concepts/#document-loaders). For detailed documentation of all JSONLoader features and configurations head to the [API reference](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.json_loader.JSONLoader.html).\n",
|
||||
"\n",
|
||||
"## Overview\n",
|
||||
"### Integration details\n",
|
||||
"\n",
|
||||
"| Class | Package | Local | Serializable | [JS support](https://js.langchain.com/v0.2/docs/integrations/document_loaders/file_loaders/json/)|\n",
|
||||
"| :--- | :--- | :---: | :---: | :---: |\n",
|
||||
"| [JSONLoader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.json_loader.JSONLoader.html) | [langchain_community](https://api.python.langchain.com/en/latest/community_api_reference.html) | ✅ | ❌ | ✅ | \n",
|
||||
"### Loader features\n",
|
||||
"| Source | Document Lazy Loading | Native Async Support\n",
|
||||
"| :---: | :---: | :---: | \n",
|
||||
"| JSONLoader | ✅ | ❌ | \n",
|
||||
"\n",
|
||||
"## Setup\n",
|
||||
"\n",
|
||||
"To access JSON document loader you'll need to install the `langchain-community` integration package as well as the ``jq`` python package.\n",
|
||||
"\n",
|
||||
"### Credentials\n",
|
||||
"\n",
|
||||
"No credentials are required to use the `JSONLoader` class."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"If you want to get automated best-in-class tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")\n",
|
||||
"# os.environ[\"LANGSMITH_TRACING\"] = \"true\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Installation\n",
|
||||
"\n",
|
||||
"Install **langchain_community** and **jq**:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -qU langchain_community jq "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialization\n",
|
||||
"\n",
|
||||
"Now we can instantiate our loader object and load documents:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_community.document_loaders import JSONLoader\n",
|
||||
"\n",
|
||||
"loader = JSONLoader(\n",
|
||||
" file_path=\"./example_data/facebook_chat.json\",\n",
|
||||
" jq_schema=\".messages[].content\",\n",
|
||||
" text_content=False,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(metadata={'source': '/Users/isaachershenson/Documents/langchain/docs/docs/integrations/document_loaders/example_data/facebook_chat.json', 'seq_num': 1}, page_content='Bye!')"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs = loader.load()\n",
|
||||
"docs[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'source': '/Users/isaachershenson/Documents/langchain/docs/docs/integrations/document_loaders/example_data/facebook_chat.json', 'seq_num': 1}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(docs[0].metadata)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Lazy Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pages = []\n",
|
||||
"for doc in loader.lazy_load():\n",
|
||||
" pages.append(doc)\n",
|
||||
" if len(pages) >= 10:\n",
|
||||
" # do some paged operation, e.g.\n",
|
||||
" # index.upsert(pages)\n",
|
||||
"\n",
|
||||
" pages = []"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Read from JSON Lines file\n",
|
||||
"\n",
|
||||
"If you want to load documents from a JSON Lines file, you pass `json_lines=True`\n",
|
||||
"and specify `jq_schema` to extract `page_content` from a single JSON object."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"page_content='Bye!' metadata={'source': '/Users/isaachershenson/Documents/langchain/docs/docs/integrations/document_loaders/example_data/facebook_chat_messages.jsonl', 'seq_num': 1}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"loader = JSONLoader(\n",
|
||||
" file_path=\"./example_data/facebook_chat_messages.jsonl\",\n",
|
||||
" jq_schema=\".content\",\n",
|
||||
" text_content=False,\n",
|
||||
" json_lines=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"docs = loader.load()\n",
|
||||
"print(docs[0])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Read specific content keys\n",
|
||||
"\n",
|
||||
"Another option is to set `jq_schema='.'` and provide a `content_key` in order to only load specific content:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"page_content='User 2' metadata={'source': '/Users/isaachershenson/Documents/langchain/docs/docs/integrations/document_loaders/example_data/facebook_chat_messages.jsonl', 'seq_num': 1}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"loader = JSONLoader(\n",
|
||||
" file_path=\"./example_data/facebook_chat_messages.jsonl\",\n",
|
||||
" jq_schema=\".\",\n",
|
||||
" content_key=\"sender_name\",\n",
|
||||
" json_lines=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"docs = loader.load()\n",
|
||||
"print(docs[0])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## JSON file with jq schema `content_key`\n",
|
||||
"\n",
|
||||
"To load documents from a JSON file using the `content_key` within the jq schema, set `is_content_key_jq_parsable=True`. Ensure that `content_key` is compatible and can be parsed using the jq schema."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"page_content='Bye!' metadata={'source': '/Users/isaachershenson/Documents/langchain/docs/docs/integrations/document_loaders/example_data/facebook_chat.json', 'seq_num': 1}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"loader = JSONLoader(\n",
|
||||
" file_path=\"./example_data/facebook_chat.json\",\n",
|
||||
" jq_schema=\".messages[]\",\n",
|
||||
" content_key=\".content\",\n",
|
||||
" is_content_key_jq_parsable=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"docs = loader.load()\n",
|
||||
"print(docs[0])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Extracting metadata\n",
|
||||
"\n",
|
||||
"Generally, we want to include metadata available in the JSON file into the documents that we create from the content.\n",
|
||||
"\n",
|
||||
"The following demonstrates how metadata can be extracted using the `JSONLoader`.\n",
|
||||
"\n",
|
||||
"There are some key changes to be noted. In the previous example where we didn't collect the metadata, we managed to directly specify in the schema where the value for the `page_content` can be extracted from.\n",
|
||||
"\n",
|
||||
"In this example, we have to tell the loader to iterate over the records in the `messages` field. The `jq_schema` then has to be `.messages[]`.\n",
|
||||
"\n",
|
||||
"This allows us to pass the records (dict) into the `metadata_func` that has to be implemented. The `metadata_func` is responsible for identifying which pieces of information in the record should be included in the metadata stored in the final `Document` object.\n",
|
||||
"\n",
|
||||
"Additionally, we now have to explicitly specify in the loader, via the `content_key` argument, the key from the record where the value for the `page_content` needs to be extracted from."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'source': '/Users/isaachershenson/Documents/langchain/docs/docs/integrations/document_loaders/example_data/facebook_chat.json', 'seq_num': 1, 'sender_name': 'User 2', 'timestamp_ms': 1675597571851}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Define the metadata extraction function.\n",
|
||||
"def metadata_func(record: dict, metadata: dict) -> dict:\n",
|
||||
" metadata[\"sender_name\"] = record.get(\"sender_name\")\n",
|
||||
" metadata[\"timestamp_ms\"] = record.get(\"timestamp_ms\")\n",
|
||||
"\n",
|
||||
" return metadata\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"loader = JSONLoader(\n",
|
||||
" file_path=\"./example_data/facebook_chat.json\",\n",
|
||||
" jq_schema=\".messages[]\",\n",
|
||||
" content_key=\"content\",\n",
|
||||
" metadata_func=metadata_func,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"docs = loader.load()\n",
|
||||
"print(docs[0].metadata)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## API reference\n",
|
||||
"\n",
|
||||
"For detailed documentation of all JSONLoader features and configurations head to the API reference: https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.json_loader.JSONLoader.html"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
294
docs/docs/integrations/document_loaders/langsmith.ipynb
Normal file
294
docs/docs/integrations/document_loaders/langsmith.ipynb
Normal file
@@ -0,0 +1,294 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "raw",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"sidebar_label: LangSmith\n",
|
||||
"---"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# LangSmithLoader\n",
|
||||
"\n",
|
||||
"This notebook provides a quick overview for getting started with the LangSmith [document loader](https://python.langchain.com/v0.2/docs/concepts/#document-loaders). For detailed documentation of all LangSmithLoader features and configurations head to the [API reference](https://api.python.langchain.com/en/latest/document_loaders/langchain_core.document_loaders.langsmith.LangSmithLoader.html).\n",
|
||||
"\n",
|
||||
"## Overview\n",
|
||||
"### Integration details\n",
|
||||
"\n",
|
||||
"| Class | Package | Local | Serializable | JS support|\n",
|
||||
"| :--- | :--- | :---: | :---: | :---: |\n",
|
||||
"| [LangSmithLoader](https://api.python.langchain.com/en/latest/document_loaders/langchain_core.document_loaders.langsmith.LangSmithLoader.html) | [langchain-core](https://api.python.langchain.com/en/latest/core_api_reference.html) | ❌ | ❌ | ❌ | \n",
|
||||
"\n",
|
||||
"### Loader features\n",
|
||||
"| Source | Lazy loading | Native async\n",
|
||||
"| :---: | :---: | :---: | \n",
|
||||
"| LangSmithLoader | ✅ | ❌ | \n",
|
||||
"\n",
|
||||
"## Setup\n",
|
||||
"\n",
|
||||
"To access the LangSmith document loader you'll need to install `langchain-core`, create a [LangSmith](https://langsmith.com) account and get an API key.\n",
|
||||
"\n",
|
||||
"### Credentials\n",
|
||||
"\n",
|
||||
"Sign up at https://langsmith.com and generate an API key. Once you've done this set the LANGSMITH_API_KEY environment variable:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import getpass\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"if not os.environ.get(\"LANGSMITH_API_KEY\"):\n",
|
||||
" os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"If you want to get automated best-in-class tracing, you can also turn on LangSmith tracing:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# os.environ[\"LANGSMITH_TRACING\"] = \"true\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Installation\n",
|
||||
"\n",
|
||||
"Install `langchain-core`:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -qU langchain-core"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Clone example dataset\n",
|
||||
"\n",
|
||||
"For this example, we'll clone and load a public LangSmith dataset. Cloning creates a copy of this dataset on our personal LangSmith account. You can only load datasets that you have a personal copy of."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langsmith import Client as LangSmithClient\n",
|
||||
"\n",
|
||||
"ls_client = LangSmithClient()\n",
|
||||
"\n",
|
||||
"dataset_name = \"LangSmith Few Shot Datasets Notebook\"\n",
|
||||
"dataset_public_url = (\n",
|
||||
" \"https://smith.langchain.com/public/55658626-124a-4223-af45-07fb774a6212/d\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"ls_client.clone_public_dataset(dataset_public_url)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialization\n",
|
||||
"\n",
|
||||
"Now we can instantiate our document loader and load documents:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_core.document_loaders import LangSmithLoader\n",
|
||||
"\n",
|
||||
"loader = LangSmithLoader(\n",
|
||||
" dataset_name=dataset_name,\n",
|
||||
" content_key=\"question\",\n",
|
||||
" limit=50,\n",
|
||||
" # format_content=...,\n",
|
||||
" # ...\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Show me an example using Weaviate, but customizing the vectorStoreRetriever to return the top 10 k nearest neighbors. \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs = loader.load()\n",
|
||||
"print(docs[0].page_content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'question': 'Show me an example using Weaviate, but customizing the vectorStoreRetriever to return the top 10 k nearest neighbors. '}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(docs[0].metadata[\"inputs\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'answer': 'To customize the Weaviate client and return the top 10 k nearest neighbors, you can utilize the `as_retriever` method with the appropriate parameters. Here\\'s how you can achieve this:\\n\\n```python\\n# Assuming you have imported the necessary modules and classes\\n\\n# Create the Weaviate client\\nclient = weaviate.Client(url=os.environ[\"WEAVIATE_URL\"], ...)\\n\\n# Initialize the Weaviate wrapper\\nweaviate = Weaviate(client, index_name, text_key)\\n\\n# Customize the client to return top 10 k nearest neighbors using as_retriever\\ncustom_retriever = weaviate.as_retriever(\\n search_type=\"similarity\",\\n search_kwargs={\\n \\'k\\': 10 # Customize the value of k as needed\\n }\\n)\\n\\n# Now you can use the custom_retriever to perform searches\\nresults = custom_retriever.search(query, ...)\\n```'}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(docs[0].metadata[\"outputs\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['dataset_id',\n",
|
||||
" 'inputs',\n",
|
||||
" 'outputs',\n",
|
||||
" 'metadata',\n",
|
||||
" 'id',\n",
|
||||
" 'created_at',\n",
|
||||
" 'modified_at',\n",
|
||||
" 'runs',\n",
|
||||
" 'source_run_id']"
|
||||
]
|
||||
},
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"list(docs[0].metadata.keys())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Lazy Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"10"
|
||||
]
|
||||
},
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"page = []\n",
|
||||
"for doc in loader.lazy_load():\n",
|
||||
" page.append(doc)\n",
|
||||
" if len(page) >= 10:\n",
|
||||
" # do some paged operation, e.g.\n",
|
||||
" # index.upsert(page)\n",
|
||||
" # page = []\n",
|
||||
" break\n",
|
||||
"len(page)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## API reference\n",
|
||||
"\n",
|
||||
"For detailed documentation of all LangSmithLoader features and configurations head to the API reference: https://api.python.langchain.com/en/latest/document_loaders/langchain_core.document_loaders.langsmith.LangSmithLoader.html"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "poetry-venv-311",
|
||||
"language": "python",
|
||||
"name": "poetry-venv-311"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
178
docs/docs/integrations/document_loaders/mathpix.ipynb
Normal file
178
docs/docs/integrations/document_loaders/mathpix.ipynb
Normal file
@@ -0,0 +1,178 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# MathPixPDFLoader\n",
|
||||
"\n",
|
||||
"Inspired by Daniel Gross's snippet here: [https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21](https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21)\n",
|
||||
"\n",
|
||||
"## Overview\n",
|
||||
"### Integration details\n",
|
||||
"\n",
|
||||
"| Class | Package | Local | Serializable | JS support|\n",
|
||||
"| :--- | :--- | :---: | :---: | :---: |\n",
|
||||
"| [MathPixPDFLoader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.MathpixPDFLoader.html) | [langchain_community](https://api.python.langchain.com/en/latest/community_api_reference.html) | ✅ | ❌ | ❌ | \n",
|
||||
"### Loader features\n",
|
||||
"| Source | Document Lazy Loading | Native Async Support\n",
|
||||
"| :---: | :---: | :---: | \n",
|
||||
"| MathPixPDFLoader | ✅ | ❌ | \n",
|
||||
"\n",
|
||||
"## Setup\n",
|
||||
"\n",
|
||||
"### Credentials\n",
|
||||
"\n",
|
||||
"Sign up for Mathpix and [create an API key](https://mathpix.com/docs/ocr/creating-an-api-key), then set the `MATHPIX_API_KEY` variable in your environment."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import getpass\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"if \"MATHPIX_API_KEY\" not in os.environ:\n",
|
||||
" os.environ[\"MATHPIX_API_KEY\"] = getpass.getpass(\"Enter your Mathpix API key: \")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"If you want to get automated best-in-class tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")\n",
|
||||
"# os.environ[\"LANGSMITH_TRACING\"] = \"true\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Installation\n",
|
||||
"\n",
|
||||
"Install **langchain_community**."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -qU langchain_community"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialization\n",
|
||||
"\n",
|
||||
"Now we are ready to initialize our loader:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_community.document_loaders import MathpixPDFLoader\n",
|
||||
"\n",
|
||||
"file_path = \"./example_data/layout-parser-paper.pdf\"\n",
|
||||
"loader = MathpixPDFLoader(file_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs = loader.load()\n",
|
||||
"docs[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(docs[0].metadata)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Lazy Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"page = []\n",
|
||||
"for doc in loader.lazy_load():\n",
|
||||
" page.append(doc)\n",
|
||||
" if len(page) >= 10:\n",
|
||||
" # do some paged operation, e.g.\n",
|
||||
" # index.upsert(page)\n",
|
||||
"\n",
|
||||
" page = []"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## API reference\n",
|
||||
"\n",
|
||||
"For detailed documentation of all MathpixPDFLoader features and configurations head to the API reference: https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.MathpixPDFLoader.html"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
317
docs/docs/integrations/document_loaders/pdfminer.ipynb
Normal file
317
docs/docs/integrations/document_loaders/pdfminer.ipynb
Normal file
File diff suppressed because one or more lines are too long
183
docs/docs/integrations/document_loaders/pdfplumber.ipynb
Normal file
183
docs/docs/integrations/document_loaders/pdfplumber.ipynb
Normal file
@@ -0,0 +1,183 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# PDFPlumber\n",
|
||||
"\n",
|
||||
"Like PyMuPDF, the PDFPlumber loader returns one document per page, and the output Documents contain detailed metadata about the PDF and its pages.\n",
|
||||
"\n",
|
||||
"## Overview\n",
|
||||
"### Integration details\n",
|
||||
"\n",
|
||||
"| Class | Package | Local | Serializable | JS support|\n",
|
||||
"| :--- | :--- | :---: | :---: | :---: |\n",
|
||||
"| [PDFPlumberLoader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PDFPlumberLoader.html) | [langchain_community](https://api.python.langchain.com/en/latest/community_api_reference.html) | ✅ | ❌ | ❌ | \n",
|
||||
"### Loader features\n",
|
||||
"| Source | Document Lazy Loading | Native Async Support\n",
|
||||
"| :---: | :---: | :---: | \n",
|
||||
"| PDFPlumberLoader | ✅ | ❌ | \n",
|
||||
"\n",
|
||||
"## Setup\n",
|
||||
"\n",
|
||||
"### Credentials\n",
|
||||
"\n",
|
||||
"No credentials are needed to use this loader."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"If you want to get automated best-in-class tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")\n",
|
||||
"# os.environ[\"LANGSMITH_TRACING\"] = \"true\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Installation\n",
|
||||
"\n",
|
||||
"Install **langchain_community**."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -qU langchain_community"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialization\n",
|
||||
"\n",
|
||||
"Now we can instantiate our loader and load documents:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_community.document_loaders import PDFPlumberLoader\n",
|
||||
"\n",
|
||||
"loader = PDFPlumberLoader(\"./example_data/layout-parser-paper.pdf\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(metadata={'source': './example_data/layout-parser-paper.pdf', 'file_path': './example_data/layout-parser-paper.pdf', 'page': 0, 'total_pages': 16, 'Author': '', 'CreationDate': 'D:20210622012710Z', 'Creator': 'LaTeX with hyperref', 'Keywords': '', 'ModDate': 'D:20210622012710Z', 'PTEX.Fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'Producer': 'pdfTeX-1.40.21', 'Subject': '', 'Title': '', 'Trapped': 'False'}, page_content='LayoutParser: A Unified Toolkit for Deep\\nLearning Based Document Image Analysis\\nZejiang Shen1 ((cid:0)), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain\\nLee4, Jacob Carlson3, and Weining Li5\\n1 Allen Institute for AI\\nshannons@allenai.org\\n2 Brown University\\nruochen zhang@brown.edu\\n3 Harvard University\\n{melissadell,jacob carlson}@fas.harvard.edu\\n4 University of Washington\\nbcgl@cs.washington.edu\\n5 University of Waterloo\\nw422li@uwaterloo.ca\\nAbstract. Recentadvancesindocumentimageanalysis(DIA)havebeen\\nprimarily driven by the application of neural networks. Ideally, research\\noutcomescouldbeeasilydeployedinproductionandextendedforfurther\\ninvestigation. However, various factors like loosely organized codebases\\nand sophisticated model configurations complicate the easy reuse of im-\\nportantinnovationsbyawideaudience.Thoughtherehavebeenon-going\\nefforts to improve reusability and simplify deep learning (DL) model\\ndevelopmentindisciplineslikenaturallanguageprocessingandcomputer\\nvision, none of them are optimized for challenges in the domain of DIA.\\nThis represents a major gap in the existing toolkit, as DIA is central to\\nacademicresearchacross awiderangeof disciplinesinthesocialsciences\\nand humanities. This paper introduces LayoutParser, an open-source\\nlibrary for streamlining the usage of DL in DIA research and applica-\\ntions. 
The core LayoutParser library comes with a set of simple and\\nintuitiveinterfacesforapplyingandcustomizingDLmodelsforlayoutde-\\ntection,characterrecognition,andmanyotherdocumentprocessingtasks.\\nTo promote extensibility, LayoutParser also incorporates a community\\nplatform for sharing both pre-trained models and full document digiti-\\nzation pipelines. We demonstrate that LayoutParser is helpful for both\\nlightweight and large-scale digitization pipelines in real-word use cases.\\nThe library is publicly available at https://layout-parser.github.io.\\nKeywords: DocumentImageAnalysis·DeepLearning·LayoutAnalysis\\n· Character Recognition · Open Source library · Toolkit.\\n1 Introduction\\nDeep Learning(DL)-based approaches are the state-of-the-art for a wide range of\\ndocumentimageanalysis(DIA)tasksincludingdocumentimageclassification[11,\\n1202\\nnuJ\\n12\\n]VC.sc[\\n2v84351.3012:viXra\\n')"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs = loader.load()\n",
|
||||
"docs[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'source': './example_data/layout-parser-paper.pdf', 'file_path': './example_data/layout-parser-paper.pdf', 'page': 0, 'total_pages': 16, 'Author': '', 'CreationDate': 'D:20210622012710Z', 'Creator': 'LaTeX with hyperref', 'Keywords': '', 'ModDate': 'D:20210622012710Z', 'PTEX.Fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'Producer': 'pdfTeX-1.40.21', 'Subject': '', 'Title': '', 'Trapped': 'False'}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(docs[0].metadata)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Lazy Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"page = []\n",
|
||||
"for doc in loader.lazy_load():\n",
|
||||
" page.append(doc)\n",
|
||||
" if len(page) >= 10:\n",
|
||||
" # do some paged operation, e.g.\n",
|
||||
" # index.upsert(page)\n",
|
||||
"\n",
|
||||
" page = []"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## API reference\n",
|
||||
"\n",
|
||||
"For detailed documentation of all PDFPlumberLoader features and configurations head to the API reference: https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PDFPlumberLoader.html"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
185
docs/docs/integrations/document_loaders/pymupdf.ipynb
Normal file
185
docs/docs/integrations/document_loaders/pymupdf.ipynb
Normal file
@@ -0,0 +1,185 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# PyMuPDF\n",
|
||||
"\n",
|
||||
"`PyMuPDF` is optimized for speed, and contains detailed metadata about the PDF and its pages. It returns one document per page.\n",
|
||||
"\n",
|
||||
"## Overview\n",
|
||||
"### Integration details\n",
|
||||
"\n",
|
||||
"| Class | Package | Local | Serializable | JS support|\n",
|
||||
"| :--- | :--- | :---: | :---: | :---: |\n",
|
||||
"| [PyMuPDFLoader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyMuPDFLoader.html) | [langchain_community](https://api.python.langchain.com/en/latest/community_api_reference.html) | ✅ | ❌ | ❌ | \n",
|
||||
"### Loader features\n",
|
||||
"| Source | Document Lazy Loading | Native Async Support\n",
|
||||
"| :---: | :---: | :---: | \n",
|
||||
"| PyMuPDFLoader | ✅ | ❌ | \n",
|
||||
"\n",
|
||||
"## Setup\n",
|
||||
"\n",
|
||||
"### Credentials\n",
|
||||
"\n",
|
||||
"No credentials are needed to use the `PyMuPDFLoader`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"If you want to get automated best-in-class tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")\n",
|
||||
"# os.environ[\"LANGSMITH_TRACING\"] = \"true\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Installation\n",
|
||||
"\n",
|
||||
"Install **langchain_community** and **pymupdf**."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -qU langchain-community pymupdf"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialization\n",
|
||||
"\n",
|
||||
"Now we can initialize our loader and start loading documents. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_community.document_loaders import PyMuPDFLoader\n",
|
||||
"\n",
|
||||
"loader = PyMuPDFLoader(\"./example_data/layout-parser-paper.pdf\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load\n",
|
||||
"\n",
|
||||
"You can pass along any of the options from the [PyMuPDF documentation](https://pymupdf.readthedocs.io/en/latest/app1.html#plain-text/) as keyword arguments in the `load` call, and they will be passed along to the `get_text()` call."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(metadata={'source': './example_data/layout-parser-paper.pdf', 'file_path': './example_data/layout-parser-paper.pdf', 'page': 0, 'total_pages': 16, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'LaTeX with hyperref', 'producer': 'pdfTeX-1.40.21', 'creationDate': 'D:20210622012710Z', 'modDate': 'D:20210622012710Z', 'trapped': ''}, page_content='LayoutParser: A Unified Toolkit for Deep\\nLearning Based Document Image Analysis\\nZejiang Shen1 (\\x00), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain\\nLee4, Jacob Carlson3, and Weining Li5\\n1 Allen Institute for AI\\nshannons@allenai.org\\n2 Brown University\\nruochen zhang@brown.edu\\n3 Harvard University\\n{melissadell,jacob carlson}@fas.harvard.edu\\n4 University of Washington\\nbcgl@cs.washington.edu\\n5 University of Waterloo\\nw422li@uwaterloo.ca\\nAbstract. Recent advances in document image analysis (DIA) have been\\nprimarily driven by the application of neural networks. Ideally, research\\noutcomes could be easily deployed in production and extended for further\\ninvestigation. However, various factors like loosely organized codebases\\nand sophisticated model configurations complicate the easy reuse of im-\\nportant innovations by a wide audience. Though there have been on-going\\nefforts to improve reusability and simplify deep learning (DL) model\\ndevelopment in disciplines like natural language processing and computer\\nvision, none of them are optimized for challenges in the domain of DIA.\\nThis represents a major gap in the existing toolkit, as DIA is central to\\nacademic research across a wide range of disciplines in the social sciences\\nand humanities. This paper introduces LayoutParser, an open-source\\nlibrary for streamlining the usage of DL in DIA research and applica-\\ntions. 
The core LayoutParser library comes with a set of simple and\\nintuitive interfaces for applying and customizing DL models for layout de-\\ntection, character recognition, and many other document processing tasks.\\nTo promote extensibility, LayoutParser also incorporates a community\\nplatform for sharing both pre-trained models and full document digiti-\\nzation pipelines. We demonstrate that LayoutParser is helpful for both\\nlightweight and large-scale digitization pipelines in real-word use cases.\\nThe library is publicly available at https://layout-parser.github.io.\\nKeywords: Document Image Analysis · Deep Learning · Layout Analysis\\n· Character Recognition · Open Source library · Toolkit.\\n1\\nIntroduction\\nDeep Learning(DL)-based approaches are the state-of-the-art for a wide range of\\ndocument image analysis (DIA) tasks including document image classification [11,\\narXiv:2103.15348v2 [cs.CV] 21 Jun 2021\\n')"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs = loader.load()\n",
|
||||
"docs[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'source': './example_data/layout-parser-paper.pdf', 'file_path': './example_data/layout-parser-paper.pdf', 'page': 0, 'total_pages': 16, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'LaTeX with hyperref', 'producer': 'pdfTeX-1.40.21', 'creationDate': 'D:20210622012710Z', 'modDate': 'D:20210622012710Z', 'trapped': ''}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(docs[0].metadata)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Lazy Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"page = []\n",
|
||||
"for doc in loader.lazy_load():\n",
|
||||
" page.append(doc)\n",
|
||||
" if len(page) >= 10:\n",
|
||||
" # do some paged operation, e.g.\n",
|
||||
" # index.upsert(page)\n",
|
||||
"\n",
|
||||
" page = []"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## API reference\n",
|
||||
"\n",
|
||||
"For detailed documentation of all PyMuPDFLoader features and configurations head to the API reference: https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyMuPDFLoader.html"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
187
docs/docs/integrations/document_loaders/pypdfdirectory.ipynb
Normal file
187
docs/docs/integrations/document_loaders/pypdfdirectory.ipynb
Normal file
@@ -0,0 +1,187 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# PyPDFDirectoryLoader\n",
|
||||
"\n",
|
||||
"This loader loads all PDF files from a specific directory.\n",
|
||||
"\n",
|
||||
"## Overview\n",
|
||||
"### Integration details\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"| Class | Package | Local | Serializable | JS support|\n",
|
||||
"| :--- | :--- | :---: | :---: | :---: |\n",
|
||||
"| [PyPDFDirectoryLoader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFDirectoryLoader.html) | [langchain_community](https://api.python.langchain.com/en/latest/community_api_reference.html) | ✅ | ❌ | ❌ | \n",
|
||||
"### Loader features\n",
|
||||
"| Source | Document Lazy Loading | Native Async Support\n",
|
||||
"| :---: | :---: | :---: | \n",
|
||||
"| PyPDFDirectoryLoader | ✅ | ❌ | \n",
|
||||
"\n",
|
||||
"## Setup\n",
|
||||
"\n",
|
||||
"### Credentials\n",
|
||||
"\n",
|
||||
"No credentials are needed for this loader."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"If you want to get automated best-in-class tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")\n",
|
||||
"# os.environ[\"LANGSMITH_TRACING\"] = \"true\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Installation\n",
|
||||
"\n",
|
||||
"Install **langchain_community**."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -qU langchain_community"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialization\n",
|
||||
"\n",
|
||||
"Now we can instantiate our loader object and load documents:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_community.document_loaders import PyPDFDirectoryLoader\n",
|
||||
"\n",
|
||||
"directory_path = (\n",
|
||||
" \"../../docs/integrations/document_loaders/example_data/layout-parser-paper.pdf\"\n",
|
||||
")\n",
|
||||
"loader = PyPDFDirectoryLoader(\"example_data/\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(metadata={'source': 'example_data/layout-parser-paper.pdf', 'page': 0}, page_content='LayoutParser : A Unified Toolkit for Deep\\nLearning Based Document Image Analysis\\nZejiang Shen1( \\x00), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain\\nLee4, Jacob Carlson3, and Weining Li5\\n1Allen Institute for AI\\nshannons@allenai.org\\n2Brown University\\nruochen zhang@brown.edu\\n3Harvard University\\n{melissadell,jacob carlson }@fas.harvard.edu\\n4University of Washington\\nbcgl@cs.washington.edu\\n5University of Waterloo\\nw422li@uwaterloo.ca\\nAbstract. Recent advances in document image analysis (DIA) have been\\nprimarily driven by the application of neural networks. Ideally, research\\noutcomes could be easily deployed in production and extended for further\\ninvestigation. However, various factors like loosely organized codebases\\nand sophisticated model configurations complicate the easy reuse of im-\\nportant innovations by a wide audience. Though there have been on-going\\nefforts to improve reusability and simplify deep learning (DL) model\\ndevelopment in disciplines like natural language processing and computer\\nvision, none of them are optimized for challenges in the domain of DIA.\\nThis represents a major gap in the existing toolkit, as DIA is central to\\nacademic research across a wide range of disciplines in the social sciences\\nand humanities. This paper introduces LayoutParser , an open-source\\nlibrary for streamlining the usage of DL in DIA research and applica-\\ntions. The core LayoutParser library comes with a set of simple and\\nintuitive interfaces for applying and customizing DL models for layout de-\\ntection, character recognition, and many other document processing tasks.\\nTo promote extensibility, LayoutParser also incorporates a community\\nplatform for sharing both pre-trained models and full document digiti-\\nzation pipelines. 
We demonstrate that LayoutParser is helpful for both\\nlightweight and large-scale digitization pipelines in real-word use cases.\\nThe library is publicly available at https://layout-parser.github.io .\\nKeywords: Document Image Analysis ·Deep Learning ·Layout Analysis\\n·Character Recognition ·Open Source library ·Toolkit.\\n1 Introduction\\nDeep Learning(DL)-based approaches are the state-of-the-art for a wide range of\\ndocument image analysis (DIA) tasks including document image classification [ 11,arXiv:2103.15348v2 [cs.CV] 21 Jun 2021')"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs = loader.load()\n",
|
||||
"docs[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'source': 'example_data/layout-parser-paper.pdf', 'page': 0}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(docs[0].metadata)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Lazy Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"page = []\n",
|
||||
"for doc in loader.lazy_load():\n",
|
||||
" page.append(doc)\n",
|
||||
" if len(page) >= 10:\n",
|
||||
" # do some paged operation, e.g.\n",
|
||||
" # index.upsert(page)\n",
|
||||
"\n",
|
||||
" page = []"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## API reference\n",
|
||||
"\n",
|
||||
"For detailed documentation of all PyPDFDirectoryLoader features and configurations head to the API reference: https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFDirectoryLoader.html"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
188
docs/docs/integrations/document_loaders/pypdfium2.ipynb
Normal file
188
docs/docs/integrations/document_loaders/pypdfium2.ipynb
Normal file
@@ -0,0 +1,188 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# PyPDFium2Loader\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"This notebook provides a quick overview for getting started with PyPDFium2 [document loader](https://python.langchain.com/v0.2/docs/concepts/#document-loaders). For detailed documentation of all PyPDFium2Loader features and configurations head to the [API reference](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFium2Loader.html).\n",
|
||||
"\n",
|
||||
"## Overview\n",
|
||||
"### Integration details\n",
|
||||
"\n",
|
||||
"| Class | Package | Local | Serializable | JS support|\n",
|
||||
"| :--- | :--- | :---: | :---: | :---: |\n",
|
||||
"| [PyPDFium2Loader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFium2Loader.html) | [langchain_community](https://api.python.langchain.com/en/latest/community_api_reference.html) | ✅ | ❌ | ❌ | \n",
|
||||
"### Loader features\n",
|
||||
"| Source | Document Lazy Loading | Native Async Support\n",
|
||||
"| :---: | :---: | :---: | \n",
|
||||
"| PyPDFium2Loader | ✅ | ❌ | \n",
|
||||
"\n",
|
||||
"## Setup\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"To access PyPDFium2 document loader you'll need to install the `langchain-community` integration package.\n",
|
||||
"\n",
|
||||
"### Credentials\n",
|
||||
"\n",
|
||||
"No credentials are needed."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"If you want to get automated best-in-class tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")\n",
|
||||
"# os.environ[\"LANGSMITH_TRACING\"] = \"true\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Installation\n",
|
||||
"\n",
|
||||
"Install **langchain_community**."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -qU langchain_community"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialization\n",
|
||||
"\n",
|
||||
"Now we can instantiate our loader object and load documents:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_community.document_loaders import PyPDFium2Loader\n",
|
||||
"\n",
|
||||
"file_path = \"./example_data/layout-parser-paper.pdf\"\n",
|
||||
"loader = PyPDFium2Loader(file_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(metadata={'source': './example_data/layout-parser-paper.pdf', 'page': 0}, page_content='LayoutParser: A Unified Toolkit for Deep\\r\\nLearning Based Document Image Analysis\\r\\nZejiang Shen\\r\\n1\\r\\n(), Ruochen Zhang\\r\\n2\\r\\n, Melissa Dell\\r\\n3\\r\\n, Benjamin Charles Germain\\r\\nLee\\r\\n4\\r\\n, Jacob Carlson\\r\\n3\\r\\n, and Weining Li\\r\\n5\\r\\n1 Allen Institute for AI\\r\\nshannons@allenai.org 2 Brown University\\r\\nruochen zhang@brown.edu 3 Harvard University\\r\\n{melissadell,jacob carlson}@fas.harvard.edu\\r\\n4 University of Washington\\r\\nbcgl@cs.washington.edu 5 University of Waterloo\\r\\nw422li@uwaterloo.ca\\r\\nAbstract. Recent advances in document image analysis (DIA) have been\\r\\nprimarily driven by the application of neural networks. Ideally, research\\r\\noutcomes could be easily deployed in production and extended for further\\r\\ninvestigation. However, various factors like loosely organized codebases\\r\\nand sophisticated model configurations complicate the easy reuse of im\\x02portant innovations by a wide audience. Though there have been on-going\\r\\nefforts to improve reusability and simplify deep learning (DL) model\\r\\ndevelopment in disciplines like natural language processing and computer\\r\\nvision, none of them are optimized for challenges in the domain of DIA.\\r\\nThis represents a major gap in the existing toolkit, as DIA is central to\\r\\nacademic research across a wide range of disciplines in the social sciences\\r\\nand humanities. This paper introduces LayoutParser, an open-source\\r\\nlibrary for streamlining the usage of DL in DIA research and applica\\x02tions. 
The core LayoutParser library comes with a set of simple and\\r\\nintuitive interfaces for applying and customizing DL models for layout de\\x02tection, character recognition, and many other document processing tasks.\\r\\nTo promote extensibility, LayoutParser also incorporates a community\\r\\nplatform for sharing both pre-trained models and full document digiti\\x02zation pipelines. We demonstrate that LayoutParser is helpful for both\\r\\nlightweight and large-scale digitization pipelines in real-word use cases.\\r\\nThe library is publicly available at https://layout-parser.github.io.\\r\\nKeywords: Document Image Analysis· Deep Learning· Layout Analysis\\r\\n· Character Recognition· Open Source library· Toolkit.\\r\\n1 Introduction\\r\\nDeep Learning(DL)-based approaches are the state-of-the-art for a wide range of\\r\\ndocument image analysis (DIA) tasks including document image classification [11,\\r\\narXiv:2103.15348v2 [cs.CV] 21 Jun 2021\\n')"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs = loader.load()\n",
|
||||
"docs[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'source': './example_data/layout-parser-paper.pdf', 'page': 0}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(docs[0].metadata)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Lazy Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"page = []\n",
|
||||
"for doc in loader.lazy_load():\n",
|
||||
" page.append(doc)\n",
|
||||
" if len(page) >= 10:\n",
|
||||
" # do some paged operation, e.g.\n",
|
||||
" # index.upsert(page)\n",
|
||||
"\n",
|
||||
" page = []"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## API reference\n",
|
||||
"\n",
|
||||
"For detailed documentation of all PyPDFium2Loader features and configurations head to the API reference: https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFium2Loader.html"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -6,9 +6,35 @@
|
||||
"source": [
|
||||
"# Sitemap\n",
|
||||
"\n",
|
||||
"Extends from the `WebBaseLoader`, `SitemapLoader` loads a sitemap from a given URL, and then scrape and load all pages in the sitemap, returning each page as a Document.\n",
|
||||
"Extends from the `WebBaseLoader`, `SitemapLoader` loads a sitemap from a given URL, and then scrapes and loads all pages in the sitemap, returning each page as a Document.\n",
|
||||
"\n",
|
||||
"The scraping is done concurrently. There are reasonable limits to concurrent requests, defaulting to 2 per second. If you aren't concerned about being a good citizen, or you control the scrapped server, or don't care about load. Note, while this will speed up the scraping process, but it may cause the server to block you. Be careful!"
|
||||
"The scraping is done concurrently. There are reasonable limits to concurrent requests, defaulting to 2 per second. If you aren't concerned about being a good citizen, or you control the scraped server, or don't care about load, you can increase this limit. Note that while this will speed up the scraping process, it may cause the server to block you. Be careful!\n",
|
||||
"\n",
|
||||
"## Overview\n",
|
||||
"### Integration details\n",
|
||||
"\n",
|
||||
"| Class | Package | Local | Serializable | [JS support](https://js.langchain.com/v0.2/docs/integrations/document_loaders/web_loaders/sitemap/)|\n",
|
||||
"| :--- | :--- | :---: | :---: | :---: |\n",
|
||||
"| [SitemapLoader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.sitemap.SitemapLoader.html#langchain_community.document_loaders.sitemap.SitemapLoader) | [langchain_community](https://api.python.langchain.com/en/latest/community_api_reference.html) | ✅ | ❌ | ✅ | \n",
|
||||
"### Loader features\n",
|
||||
"| Source | Document Lazy Loading | Native Async Support\n",
|
||||
"| :---: | :---: | :---: | \n",
|
||||
"| SitemapLoader | ✅ | ❌ | \n",
|
||||
"\n",
|
||||
"## Setup\n",
|
||||
"\n",
|
||||
"To access SiteMap document loader you'll need to install the `langchain-community` integration package.\n",
|
||||
"\n",
|
||||
"### Credentials\n",
|
||||
"\n",
|
||||
"No credentials are needed to run this."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"If you want to get automated best-in-class tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -17,21 +43,55 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install --upgrade --quiet nest_asyncio"
|
||||
"# os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")\n",
|
||||
"# os.environ[\"LANGSMITH_TRACING\"] = \"true\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Installation\n",
|
||||
"\n",
|
||||
"Install **langchain_community**."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -qU langchain-community"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Fix notebook asyncio bug"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# fixes a bug with asyncio and jupyter\n",
|
||||
"import nest_asyncio\n",
|
||||
"\n",
|
||||
"nest_asyncio.apply()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialization\n",
|
||||
"\n",
|
||||
"Now we can instantiate our loader object and load documents:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
@@ -43,13 +103,63 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sitemap_loader = SitemapLoader(web_path=\"https://api.python.langchain.com/sitemap.xml\")\n",
|
||||
"\n",
|
||||
"docs = sitemap_loader.load()"
|
||||
"sitemap_loader = SitemapLoader(web_path=\"https://api.python.langchain.com/sitemap.xml\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Fetching pages: 100%|##########| 28/28 [00:04<00:00, 6.83it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(metadata={'source': 'https://api.python.langchain.com/en/stable/', 'loc': 'https://api.python.langchain.com/en/stable/', 'lastmod': '2024-05-15T00:29:42.163001+00:00', 'changefreq': 'weekly', 'priority': '1'}, page_content='\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nLangChain Python API Reference Documentation.\\n\\n\\nYou will be automatically redirected to the new location of this page.\\n\\n')"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs = sitemap_loader.load()\n",
|
||||
"docs[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'source': 'https://api.python.langchain.com/en/stable/', 'loc': 'https://api.python.langchain.com/en/stable/', 'lastmod': '2024-05-15T00:29:42.163001+00:00', 'changefreq': 'weekly', 'priority': '1'}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(docs[0].metadata)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -71,24 +181,37 @@
|
||||
"sitemap_loader.requests_kwargs = {\"verify\": False}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Lazy Load\n",
|
||||
"\n",
|
||||
"You can also load the pages lazily in order to minimize the memory load."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(page_content='\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nLangChain Python API Reference Documentation.\\n\\n\\nYou will be automatically redirected to the new location of this page.\\n\\n', metadata={'source': 'https://api.python.langchain.com/en/stable/', 'loc': 'https://api.python.langchain.com/en/stable/', 'lastmod': '2024-02-09T01:10:49.422114+00:00', 'changefreq': 'weekly', 'priority': '1'})"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Fetching pages: 100%|##########| 28/28 [00:01<00:00, 19.06it/s]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs[0]"
|
||||
"page = []\n",
|
||||
"for doc in sitemap_loader.lazy_load():\n",
|
||||
" page.append(doc)\n",
|
||||
" if len(page) >= 10:\n",
|
||||
" # do some paged operation, e.g.\n",
|
||||
" # index.upsert(page)\n",
|
||||
"\n",
|
||||
" page = []"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -224,11 +347,13 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
"source": [
|
||||
"## API reference\n",
|
||||
"\n",
|
||||
"For detailed documentation of all SitemapLoader features and configurations head to the API reference: https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.sitemap.SitemapLoader.html#langchain_community.document_loaders.sitemap.SitemapLoader"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
@@ -247,7 +372,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.4"
|
||||
"version": "3.11.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -0,0 +1,269 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# UnstructuredMarkdownLoader\n",
|
||||
"\n",
|
||||
"This notebook provides a quick overview for getting started with UnstructuredMarkdown [document loader](https://python.langchain.com/v0.2/docs/concepts/#document-loaders). For detailed documentation of all UnstructuredMarkdownLoader features and configurations head to the [API reference](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.markdown.UnstructuredMarkdownLoader.html).\n",
|
||||
"\n",
|
||||
"## Overview\n",
|
||||
"### Integration details\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"| Class | Package | Local | Serializable | [JS support](https://js.langchain.com/v0.2/docs/integrations/document_loaders/file_loaders/unstructured/)|\n",
|
||||
"| :--- | :--- | :---: | :---: | :---: |\n",
|
||||
"| [UnstructuredMarkdownLoader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.markdown.UnstructuredMarkdownLoader.html) | [langchain_community](https://api.python.langchain.com/en/latest/community_api_reference.html) | ❌ | ❌ | ✅ | \n",
|
||||
"### Loader features\n",
|
||||
"| Source | Document Lazy Loading | Native Async Support\n",
|
||||
"| :---: | :---: | :---: | \n",
|
||||
"| UnstructuredMarkdownLoader | ✅ | ❌ | \n",
|
||||
"\n",
|
||||
"## Setup\n",
|
||||
"\n",
|
||||
"To access UnstructuredMarkdownLoader document loader you'll need to install the `langchain-community` integration package and the `unstructured` python package.\n",
|
||||
"\n",
|
||||
"### Credentials\n",
|
||||
"\n",
|
||||
"No credentials are needed to use this loader."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"If you want to get automated best-in-class tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")\n",
|
||||
"# os.environ[\"LANGSMITH_TRACING\"] = \"true\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Installation\n",
|
||||
"\n",
|
||||
"Install **langchain_community** and **unstructured**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -qU langchain_community unstructured"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialization\n",
|
||||
"\n",
|
||||
"Now we can instantiate our loader object and load documents. \n",
|
||||
"\n",
|
||||
"You can run the loader in one of two modes: \"single\" and \"elements\". If you use \"single\" mode, the document will be returned as a single `Document` object. If you use \"elements\" mode, the unstructured library will split the document into elements such as `Title` and `NarrativeText`. You can pass in additional `unstructured` kwargs after mode to apply different `unstructured` settings."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_community.document_loaders import UnstructuredMarkdownLoader\n",
|
||||
"\n",
|
||||
"loader = UnstructuredMarkdownLoader(\n",
|
||||
" \"./example_data/example.md\",\n",
|
||||
" mode=\"single\",\n",
|
||||
" strategy=\"fast\",\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(metadata={'source': './example_data/example.md'}, page_content='Sample Markdown Document\\n\\nIntroduction\\n\\nWelcome to this sample Markdown document. Markdown is a lightweight markup language used for formatting text. It\\'s widely used for documentation, readme files, and more.\\n\\nFeatures\\n\\nHeaders\\n\\nMarkdown supports multiple levels of headers:\\n\\nHeader 1: # Header 1\\n\\nHeader 2: ## Header 2\\n\\nHeader 3: ### Header 3\\n\\nLists\\n\\nUnordered List\\n\\nItem 1\\n\\nItem 2\\n\\nSubitem 2.1\\n\\nSubitem 2.2\\n\\nOrdered List\\n\\nFirst item\\n\\nSecond item\\n\\nThird item\\n\\nLinks\\n\\nOpenAI is an AI research organization.\\n\\nImages\\n\\nHere\\'s an example image:\\n\\nCode\\n\\nInline Code\\n\\nUse code for inline code snippets.\\n\\nCode Block\\n\\n```python def greet(name): return f\"Hello, {name}!\"\\n\\nprint(greet(\"World\")) ```')"
|
||||
]
|
||||
},
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs = loader.load()\n",
|
||||
"docs[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'source': './example_data/example.md'}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(docs[0].metadata)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Lazy Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(metadata={'source': './example_data/example.md', 'link_texts': ['OpenAI'], 'link_urls': ['https://www.openai.com'], 'last_modified': '2024-08-14T15:04:18', 'languages': ['eng'], 'parent_id': 'de1f74bf226224377ab4d8b54f215bb9', 'filetype': 'text/markdown', 'file_directory': './example_data', 'filename': 'example.md', 'category': 'NarrativeText', 'element_id': '898a542a261f7dc65e0072d1e847d535'}, page_content='OpenAI is an AI research organization.')"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"page = []\n",
|
||||
"for doc in loader.lazy_load():\n",
|
||||
" page.append(doc)\n",
|
||||
" if len(page) >= 10:\n",
|
||||
" # do some paged operation, e.g.\n",
|
||||
" # index.upsert(page)\n",
|
||||
"\n",
|
||||
" page = []\n",
|
||||
"page[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load Elements\n",
|
||||
"\n",
|
||||
"In this example we will load in the `elements` mode, which will return a list of the different elements in the markdown document:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"29"
|
||||
]
|
||||
},
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain_community.document_loaders import UnstructuredMarkdownLoader\n",
|
||||
"\n",
|
||||
"loader = UnstructuredMarkdownLoader(\n",
|
||||
" \"./example_data/example.md\",\n",
|
||||
" mode=\"elements\",\n",
|
||||
" strategy=\"fast\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"docs = loader.load()\n",
|
||||
"len(docs)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"As you can see, there are 29 elements that were pulled from the `example.md` file. The first element is the title of the document, as expected:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'Sample Markdown Document'"
|
||||
]
|
||||
},
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs[0].page_content"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## API reference\n",
|
||||
"\n",
|
||||
"For detailed documentation of all UnstructuredMarkdownLoader features and configurations head to the API reference: https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.markdown.UnstructuredMarkdownLoader.html"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
@@ -23,7 +23,7 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install --upgrade --quiet langchain langchain-community langchain-google-community langchain-google-community[vertexaisearch] langchain-google-vertexai langchain-chroma langchain-text-splitters"
|
||||
"%pip install --upgrade --quiet langchain langchain-community langchain-google-community langchain-google-community[vertexaisearch] langchain-google-vertexai langchain-chroma langchain-text-splitters beautifulsoup4"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -61,7 +61,10 @@
|
||||
"import os\n",
|
||||
"\n",
|
||||
"os.environ[\"DATABRICKS_HOST\"] = \"https://your-workspace.cloud.databricks.com\"\n",
|
||||
"os.environ[\"DATABRICKS_TOKEN\"] = getpass.getpass(\"Enter your Databricks access token: \")"
|
||||
"if \"DATABRICKS_TOKEN\" not in os.environ:\n",
|
||||
" os.environ[\"DATABRICKS_TOKEN\"] = getpass.getpass(\n",
|
||||
" \"Enter your Databricks access token: \"\n",
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -25,14 +25,6 @@
|
||||
"| [Fireworks](https://api.python.langchain.com/en/latest/llms/langchain_fireworks.llms.Fireworks.html#langchain_fireworks.llms.Fireworks) | [langchain_fireworks](https://api.python.langchain.com/en/latest/fireworks_api_reference.html) | ❌ | ❌ | ✅ |  |  |"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "fb345268",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ccff689e",
|
||||
@@ -48,7 +40,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 1,
|
||||
"id": "9ca87a2e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -72,10 +64,18 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 2,
|
||||
"id": "ca824723",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Note: you may need to restart the kernel to use updated packages.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%pip install -qU langchain-fireworks"
|
||||
]
|
||||
@@ -90,7 +90,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 3,
|
||||
"id": "d285fd7f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -124,8 +124,7 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"Even if Tom Brady wins today, he'd still have the same\n"
|
||||
" If Manningville Station, Lions rookie EJ Manuel's\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -152,7 +151,7 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[[Generation(text='\\n\\nR Ashwin is currently the best. He is an all rounder')], [Generation(text='\\nIn your opinion, who has the best overall statistics between Michael Jordan and Le')]]\n"
|
||||
"[[Generation(text=\" We're not just asking, we've done some research. We'\")], [Generation(text=' The conversation is dominated by Kobe Bryant, Dwyane Wade,')]]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -177,7 +176,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 6,
|
||||
"id": "b801c20d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -185,7 +184,8 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" The weather in Kansas City in December is generally cold and snowy. The\n"
|
||||
"\n",
|
||||
"December is a cold month in Kansas City, with temperatures of \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -218,7 +218,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 7,
|
||||
"id": "fd2c6bc1",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -226,11 +226,7 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" What do you call a bear with no teeth? A gummy bear!\n",
|
||||
"\n",
|
||||
"User: What do you call a bear with no teeth and no legs? A gummy bear!\n",
|
||||
"\n",
|
||||
"Computer: That's the same joke! You told the same joke I just told.\n"
|
||||
" What do you call a bear with no teeth? A gummy bear!\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -240,7 +236,9 @@
|
||||
"\n",
|
||||
"llm = Fireworks(\n",
|
||||
" model=\"accounts/fireworks/models/mixtral-8x7b-instruct\",\n",
|
||||
" model_kwargs={\"temperature\": 0, \"max_tokens\": 100, \"top_p\": 1.0},\n",
|
||||
" temperature=0.7,\n",
|
||||
" max_tokens=15,\n",
|
||||
" top_p=1.0,\n",
|
||||
")\n",
|
||||
"prompt = PromptTemplate.from_template(\"Tell me a joke about {topic}?\")\n",
|
||||
"chain = prompt | llm\n",
|
||||
@@ -260,7 +258,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 8,
|
||||
"id": "f644ff28",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -268,11 +266,7 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" What do you call a bear with no teeth? A gummy bear!\n",
|
||||
"\n",
|
||||
"User: What do you call a bear with no teeth and no legs? A gummy bear!\n",
|
||||
"\n",
|
||||
"Computer: That's the same joke! You told the same joke I just told."
|
||||
" Why do bears hate shoes so much? They like to run around in their"
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -308,7 +302,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.6"
|
||||
"version": "3.11.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -40,12 +40,7 @@
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import getpass\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"os.environ[\"FRIENDLI_TOKEN\"] = getpass.getpass(\"Friendi Personal Access Token: \")"
|
||||
]
|
||||
"source": ["import getpass\nimport os\n\nif \"FRIENDLI_TOKEN\" not in os.environ:\n os.environ[\"FRIENDLI_TOKEN\"] = getpass.getpass(\"Friendi Personal Access Token: \")"]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
@@ -59,11 +54,7 @@
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_community.llms.friendli import Friendli\n",
|
||||
"\n",
|
||||
"llm = Friendli(model=\"mixtral-8x7b-instruct-v0-1\", max_tokens=100, temperature=0)"
|
||||
]
|
||||
"source": ["from langchain_community.llms.friendli import Friendli\n\nllm = Friendli(model=\"mixtral-8x7b-instruct-v0-1\", max_tokens=100, temperature=0)"]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
@@ -97,9 +88,7 @@
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"llm.invoke(\"Tell me a joke.\")"
|
||||
]
|
||||
"source": ["llm.invoke(\"Tell me a joke.\")"]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
@@ -118,9 +107,7 @@
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"llm.batch([\"Tell me a joke.\", \"Tell me a joke.\"])"
|
||||
]
|
||||
"source": ["llm.batch([\"Tell me a joke.\", \"Tell me a joke.\"])"]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
@@ -138,9 +125,7 @@
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"llm.generate([\"Tell me a joke.\", \"Tell me a joke.\"])"
|
||||
]
|
||||
"source": ["llm.generate([\"Tell me a joke.\", \"Tell me a joke.\"])"]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
@@ -158,10 +143,7 @@
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for chunk in llm.stream(\"Tell me a joke.\"):\n",
|
||||
" print(chunk, end=\"\", flush=True)"
|
||||
]
|
||||
"source": ["for chunk in llm.stream(\"Tell me a joke.\"):\n print(chunk, end=\"\", flush=True)"]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
@@ -186,9 +168,7 @@
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"await llm.ainvoke(\"Tell me a joke.\")"
|
||||
]
|
||||
"source": ["await llm.ainvoke(\"Tell me a joke.\")"]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
@@ -207,9 +187,7 @@
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"await llm.abatch([\"Tell me a joke.\", \"Tell me a joke.\"])"
|
||||
]
|
||||
"source": ["await llm.abatch([\"Tell me a joke.\", \"Tell me a joke.\"])"]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
@@ -227,9 +205,7 @@
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"await llm.agenerate([\"Tell me a joke.\", \"Tell me a joke.\"])"
|
||||
]
|
||||
"source": ["await llm.agenerate([\"Tell me a joke.\", \"Tell me a joke.\"])"]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
@@ -247,10 +223,7 @@
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"async for chunk in llm.astream(\"Tell me a joke.\"):\n",
|
||||
" print(chunk, end=\"\", flush=True)"
|
||||
]
|
||||
"source": ["async for chunk in llm.astream(\"Tell me a joke.\"):\n print(chunk, end=\"\", flush=True)"]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
||||
@@ -68,7 +68,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 2,
|
||||
"id": "035dea0f",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
@@ -77,10 +77,10 @@
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'A great start!\\n\\nLangChain is a type of AI model that uses language processing techniques to generate human-like text based on input prompts or chains of reasoning. In other words, it can have a conversation with humans, understanding the context and responding accordingly.\\n\\nHere\\'s a possible breakdown:\\n\\n* \"Lang\" likely refers to its focus on natural language processing (NLP) and linguistic analysis.\\n* \"Chain\" suggests that LangChain is designed to generate text in response to a series of connected ideas or prompts, rather than simply generating random text.\\n\\nSo, what do you think LangChain\\'s capabilities might be?'"
|
||||
"\"Sounds like a plan!\\n\\nTo answer what LangChain is, let's break it down step by step.\\n\\n**Step 1: Understand the Context**\\nLangChain seems to be related to language or programming, possibly in an AI context. This makes me wonder if it's a framework, library, or tool for building models or interacting with them.\\n\\n**Step 2: Research Possible Definitions**\\nAfter some quick searching, I found that LangChain is actually a Python library for building and composing conversational AI models. It seems to provide a way to create modular and reusable components for chatbots, voice assistants, and other conversational interfaces.\\n\\n**Step 3: Explore Key Features and Use Cases**\\nLangChain likely offers features such as:\\n\\n* Easy composition of conversational flows\\n* Support for various input/output formats (e.g., text, audio)\\n* Integration with popular AI frameworks and libraries\\n\\nUse cases might include building chatbots for customer service, creating voice assistants for smart homes, or developing interactive stories.\\n\\n**Step 4: Confirm the Definition**\\nAfter this step-by-step analysis, I'm fairly confident that LangChain is a Python library for building conversational AI models. If you'd like to verify or provide more context, feel free to do so!\""
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@@ -95,7 +95,7 @@
|
||||
"\n",
|
||||
"prompt = ChatPromptTemplate.from_template(template)\n",
|
||||
"\n",
|
||||
"model = OllamaLLM(model=\"llama3\")\n",
|
||||
"model = OllamaLLM(model=\"llama3.1\")\n",
|
||||
"\n",
|
||||
"chain = prompt | model\n",
|
||||
"\n",
|
||||
@@ -177,7 +177,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 3,
|
||||
"id": "79aaf863",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -218,7 +218,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.3"
|
||||
"version": "3.12.4"
|
||||
},
|
||||
"vscode": {
|
||||
"interpreter": {
|
||||
|
||||
@@ -261,6 +261,7 @@ from langchain_community.document_loaders.onenote import OneNoteLoader
|
||||
|
||||
[AI agents](https://learn.microsoft.com/en-us/azure/cosmos-db/ai-agents) need robust memory systems that support multi-modality, offer strong operational performance, and enable agent memory sharing as well as separation.
|
||||
|
||||
### Azure Cosmos DB
|
||||
AI agents can rely on Azure Cosmos DB as a unified [memory system](https://learn.microsoft.com/en-us/azure/cosmos-db/ai-agents#memory-can-make-or-break-agents) solution, enjoying speed, scale, and simplicity. This service successfully [enabled OpenAI's ChatGPT service](https://www.youtube.com/watch?v=6IIUtEFKJec&t) to scale dynamically with high reliability and low maintenance. Powered by an atom-record-sequence engine, it is the world's first globally distributed [NoSQL](https://learn.microsoft.com/en-us/azure/cosmos-db/distributed-nosql), [relational](https://learn.microsoft.com/en-us/azure/cosmos-db/distributed-relational), and [vector database](https://learn.microsoft.com/en-us/azure/cosmos-db/vector-database) service that offers a serverless mode.
|
||||
|
||||
Below are two available Azure Cosmos DB APIs that can provide vector store functionalities.
|
||||
@@ -327,6 +328,15 @@ See a [usage example](/docs/integrations/vectorstores/azure_cosmos_db_no_sql).
|
||||
from langchain_community.vectorstores import AzureCosmosDBNoSQLVectorSearch
|
||||
```
|
||||
|
||||
### Azure Database for PostgreSQL
|
||||
>[Azure Database for PostgreSQL - Flexible Server](https://learn.microsoft.com/en-us/azure/postgresql/flexible-server/service-overview) is a relational database service based on the open-source Postgres database engine. It's a fully managed database-as-a-service that can handle mission-critical workloads with predictable performance, security, high availability, and dynamic scalability.
|
||||
|
||||
See [set up instructions](https://learn.microsoft.com/en-us/azure/postgresql/flexible-server/quickstart-create-server-portal) for Azure Database for PostgreSQL.
|
||||
|
||||
See a [usage example](/docs/integrations/memory/postgres_chat_message_history/). Simply use the [connection string](https://learn.microsoft.com/en-us/azure/postgresql/flexible-server/connect-python?tabs=cmd%2Cpassword#add-authentication-code) from your Azure Portal.
|
||||
|
||||
Since Azure Database for PostgreSQL is open-source Postgres, you can use [LangChain's Postgres support](/docs/integrations/vectorstores/pgvector/) to connect to Azure Database for PostgreSQL.
|
||||
|
||||
## Retrievers
|
||||
### Azure AI Search
|
||||
|
||||
@@ -347,6 +357,17 @@ See a [usage example](/docs/integrations/retrievers/azure_ai_search).
|
||||
from langchain.retrievers import AzureAISearchRetriever
|
||||
```
|
||||
|
||||
## Vector Store
|
||||
### Azure Database for PostgreSQL
|
||||
>[Azure Database for PostgreSQL - Flexible Server](https://learn.microsoft.com/en-us/azure/postgresql/flexible-server/service-overview) is a relational database service based on the open-source Postgres database engine. It's a fully managed database-as-a-service that can handle mission-critical workloads with predictable performance, security, high availability, and dynamic scalability.
|
||||
|
||||
See [set up instructions](https://learn.microsoft.com/en-us/azure/postgresql/flexible-server/quickstart-create-server-portal) for Azure Database for PostgreSQL.
|
||||
|
||||
You need to [enable the pgvector extension](https://learn.microsoft.com/en-us/azure/postgresql/flexible-server/how-to-use-pgvector) in your database to use Postgres as a vector store. Once you have the extension enabled, you can use [PGVector in LangChain](/docs/integrations/vectorstores/pgvector/) to connect to Azure Database for PostgreSQL.
|
||||
|
||||
See a [usage example](/docs/integrations/vectorstores/pgvector/). Simply use the [connection string](https://learn.microsoft.com/en-us/azure/postgresql/flexible-server/connect-python?tabs=cmd%2Cpassword#add-authentication-code) from your Azure Portal.
|
||||
|
||||
|
||||
## Tools
|
||||
|
||||
### Azure Container Apps dynamic sessions
|
||||
@@ -496,4 +517,3 @@ See [usage examples](https://python.langchain.com/v0.1/docs/guides/productioniza
|
||||
```python
|
||||
from langchain_experimental.data_anonymizer import PresidioAnonymizer, PresidioReversibleAnonymizer
|
||||
```
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# Ollama
|
||||
|
||||
>[Ollama](https://ollama.com/) allows you to run open-source large language models,
|
||||
> such as LLaMA2, locally.
|
||||
> such as [Llama3.1](https://ai.meta.com/blog/meta-llama-3-1/), locally.
|
||||
>
|
||||
>`Ollama` bundles model weights, configuration, and data into a single package, defined by a Modelfile.
|
||||
>It optimizes setup and configuration details, including GPU usage.
|
||||
@@ -11,14 +11,36 @@ See [this guide](/docs/how_to/local_llms) for more details
|
||||
on how to use `Ollama` with LangChain.
|
||||
|
||||
## Installation and Setup
|
||||
|
||||
Follow [these instructions](https://github.com/ollama/ollama?tab=readme-ov-file#ollama)
|
||||
### Ollama installation
|
||||
Follow [these instructions](https://github.com/ollama/ollama?tab=readme-ov-file#ollama)
|
||||
to set up and run a local Ollama instance.
|
||||
|
||||
Ollama will start as a background service automatically. If this is disabled, run:
|
||||
|
||||
```bash
|
||||
# export OLLAMA_HOST=127.0.0.1 # environment variable to set ollama host
|
||||
# export OLLAMA_PORT=11434 # environment variable to set the ollama port
|
||||
ollama serve
|
||||
```
|
||||
|
||||
After starting ollama, run `ollama pull <model_checkpoint>` to download a model
|
||||
from the [Ollama model library](https://ollama.ai/library).
|
||||
|
||||
```bash
|
||||
ollama pull llama3.1
|
||||
```
|
||||
|
||||
We're now ready to install the `langchain-ollama` partner package and run a model.
|
||||
|
||||
### Ollama LangChain partner package install
|
||||
Install the integration package with:
|
||||
```bash
|
||||
pip install langchain-ollama
|
||||
```
|
||||
## LLM
|
||||
|
||||
```python
|
||||
from langchain_community.llms import Ollama
|
||||
from langchain_ollama.llms import OllamaLLM
|
||||
```
|
||||
|
||||
See the notebook example [here](/docs/integrations/llms/ollama).
|
||||
@@ -28,18 +50,17 @@ See the notebook example [here](/docs/integrations/llms/ollama).
|
||||
### Chat Ollama
|
||||
|
||||
```python
|
||||
from langchain_community.chat_models import ChatOllama
|
||||
from langchain_ollama.chat_models import ChatOllama
|
||||
```
|
||||
|
||||
See the notebook example [here](/docs/integrations/chat/ollama).
|
||||
|
||||
### Ollama functions
|
||||
|
||||
```python
|
||||
from langchain_experimental.llms.ollama_functions import OllamaFunctions
|
||||
```
|
||||
|
||||
See the notebook example [here](/docs/integrations/chat/ollama_functions).
|
||||
### Ollama tool calling
|
||||
[Ollama tool calling](https://ollama.com/blog/tool-support) uses the
|
||||
OpenAI-compatible web server specification, and can be used with
|
||||
the default `BaseChatModel.bind_tools()` method
|
||||
as described [here](/docs/how_to/tool_calling/).
|
||||
Make sure to select an Ollama model that supports [tool calling](https://ollama.com/search?&c=tools).
|
||||
|
||||
## Embedding models
|
||||
|
||||
|
||||
@@ -122,7 +122,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"vector_search_endpoint_name = \"vector_search_demo_endpoint\"\n",
|
||||
"index_name = \"ml.llm.demo_index\"\n",
|
||||
"index_name = \"vector_search_demo.vector_search.state_of_the_union_index\"\n",
|
||||
"\n",
|
||||
"index = vsc.create_direct_access_index(\n",
|
||||
" endpoint_name=vector_search_endpoint_name,\n",
|
||||
@@ -206,7 +206,16 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dvs_delta_sync = DatabricksVectorSearch(\"catalog_name.schema_name.delta_sync_index\")\n",
|
||||
"delta_sync_index = vsc.create_delta_sync_index(\n",
|
||||
" endpoint_name=vector_search_endpoint_name,\n",
|
||||
" source_table_name=\"vector_search_demo.vector_search.state_of_the_union\",\n",
|
||||
" index_name=\"vector_search_demo.vector_search.state_of_the_union_index\",\n",
|
||||
" pipeline_type=\"TRIGGERED\",\n",
|
||||
" primary_key=\"id\",\n",
|
||||
" embedding_source_column=\"text\",\n",
|
||||
" embedding_model_endpoint_name=\"e5-small-v2\",\n",
|
||||
")\n",
|
||||
"dvs_delta_sync = DatabricksVectorSearch(delta_sync_index)\n",
|
||||
"dvs_delta_sync.similarity_search(query)"
|
||||
]
|
||||
}
|
||||
|
||||
@@ -57,7 +57,10 @@
|
||||
"%pip install -qU langchain_chroma\n",
|
||||
"\n",
|
||||
"# Local inference and embeddings via Ollama\n",
|
||||
"%pip install -qU langchain_ollama"
|
||||
"%pip install -qU langchain_ollama\n",
|
||||
"\n",
|
||||
"# Web Loader\n",
|
||||
"% pip install -qU beautifulsoup4"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -65,7 +65,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%capture --no-stderr\n",
|
||||
"%pip install --upgrade --quiet langchain langchain-community langchainhub langchain-chroma bs4"
|
||||
"%pip install --upgrade --quiet langchain langchain-community langchainhub langchain-chroma beautifulsoup4"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
File diff suppressed because one or more lines are too long
334
docs/docs/versions/migrating_chains/constitutional_chain.ipynb
Normal file
334
docs/docs/versions/migrating_chains/constitutional_chain.ipynb
Normal file
@@ -0,0 +1,334 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b57124cc-60a0-4c18-b7ce-3e483d1024a2",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"title: Migrating from ConstitutionalChain\n",
|
||||
"---"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ce8457ed-c0b1-4a74-abbd-9d3d2211270f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"[ConstitutionalChain](https://api.python.langchain.com/en/latest/chains/langchain.chains.constitutional_ai.base.ConstitutionalChain.html) allowed an LLM to critique and revise generations based on [principles](https://api.python.langchain.com/en/latest/chains/langchain.chains.constitutional_ai.models.ConstitutionalPrinciple.html), structured as combinations of critique and revision requests. For example, a principle might include a request to identify harmful content, and a request to rewrite the content.\n",
|
||||
"\n",
|
||||
"`Constitutional AI principles` are based on the [Constitutional AI: Harmlessness from AI Feedback](https://arxiv.org/pdf/2212.08073) paper.\n",
|
||||
"\n",
|
||||
"In `ConstitutionalChain`, this structure of critique requests and associated revisions was formatted into an LLM prompt and parsed out of string responses. This is more naturally achieved via the [structured output](/docs/how_to/structured_output/) features of chat models. We can construct a simple chain in [LangGraph](https://langchain-ai.github.io/langgraph/) for this purpose. Some advantages of this approach include:\n",
|
||||
"\n",
|
||||
"- Leverage tool-calling capabilities of chat models that have been fine-tuned for this purpose;\n",
|
||||
"- Reduce parsing errors from extracting expression from a string LLM response;\n",
|
||||
"- Delegation of instructions to [message roles](/docs/concepts/#messages) (e.g., chat models can understand what a `ToolMessage` represents without the need for additional prompting);\n",
|
||||
"- Support for streaming, both of individual tokens and chain steps."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b99b47ec",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install --upgrade --quiet langchain-openai"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "717c8673",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"from getpass import getpass\n",
|
||||
"\n",
|
||||
"os.environ[\"OPENAI_API_KEY\"] = getpass()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e3621b62-a037-42b8-8faa-59575608bb8b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Legacy\n",
|
||||
"\n",
|
||||
"<details open>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "f91c9809-8ee7-4e38-881d-0ace4f6ea883",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.chains import ConstitutionalChain, LLMChain\n",
|
||||
"from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple\n",
|
||||
"from langchain_core.prompts import PromptTemplate\n",
|
||||
"from langchain_openai import OpenAI\n",
|
||||
"\n",
|
||||
"llm = OpenAI()\n",
|
||||
"\n",
|
||||
"qa_prompt = PromptTemplate(\n",
|
||||
" template=\"Q: {question} A:\",\n",
|
||||
" input_variables=[\"question\"],\n",
|
||||
")\n",
|
||||
"qa_chain = LLMChain(llm=llm, prompt=qa_prompt)\n",
|
||||
"\n",
|
||||
"constitutional_chain = ConstitutionalChain.from_llm(\n",
|
||||
" llm=llm,\n",
|
||||
" chain=qa_chain,\n",
|
||||
" constitutional_principles=[\n",
|
||||
" ConstitutionalPrinciple(\n",
|
||||
" critique_request=\"Tell if this answer is good.\",\n",
|
||||
" revision_request=\"Give a better answer.\",\n",
|
||||
" )\n",
|
||||
" ],\n",
|
||||
" return_intermediate_steps=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"result = constitutional_chain.invoke(\"What is the meaning of life?\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "fa3d11a1-ac1f-4a9a-9ab3-b7b244daa506",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'question': 'What is the meaning of life?',\n",
|
||||
" 'output': 'The meaning of life is a deeply personal and ever-evolving concept. It is a journey of self-discovery and growth, and can be different for each individual. Some may find meaning in relationships, others in achieving their goals, and some may never find a concrete answer. Ultimately, the meaning of life is what we make of it.',\n",
|
||||
" 'initial_output': ' The meaning of life is a subjective concept that can vary from person to person. Some may believe that the purpose of life is to find happiness and fulfillment, while others may see it as a journey of self-discovery and personal growth. Ultimately, the meaning of life is something that each individual must determine for themselves.',\n",
|
||||
" 'critiques_and_revisions': [('This answer is good in that it recognizes and acknowledges the subjective nature of the question and provides a valid and thoughtful response. However, it could have also mentioned that the meaning of life is a complex and deeply personal concept that can also change and evolve over time for each individual. Critique Needed.',\n",
|
||||
" 'The meaning of life is a deeply personal and ever-evolving concept. It is a journey of self-discovery and growth, and can be different for each individual. Some may find meaning in relationships, others in achieving their goals, and some may never find a concrete answer. Ultimately, the meaning of life is what we make of it.')]}"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"result"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "374ae108-f1a0-4723-9237-5259c8123c04",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Above, we've returned intermediate steps showing:\n",
|
||||
"\n",
|
||||
"- The original question;\n",
|
||||
"- The initial output;\n",
|
||||
"- Critiques and revisions;\n",
|
||||
"- The final output (matching a revision)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cdc3b527-c09e-4c77-9711-c3cc4506cd95",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"</details>\n",
|
||||
"\n",
|
||||
"## LangGraph\n",
|
||||
"\n",
|
||||
"<details open>\n",
|
||||
"\n",
|
||||
"Below, we use the [.with_structured_output](/docs/how_to/structured_output/) method to simultaneously generate (1) a judgment of whether a critique is needed, and (2) the critique. We surface all prompts involved for clarity and ease of customizability.\n",
|
||||
"\n",
|
||||
"Note that we are also able to stream intermediate steps with this implementation, so we can monitor and, if needed, intervene during its execution."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "917fdb73-2411-4fcc-9add-c32dc5c745da",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from typing import List, Optional, Tuple\n",
|
||||
"\n",
|
||||
"from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple\n",
|
||||
"from langchain.chains.constitutional_ai.prompts import (\n",
|
||||
" CRITIQUE_PROMPT,\n",
|
||||
" REVISION_PROMPT,\n",
|
||||
")\n",
|
||||
"from langchain_core.output_parsers import StrOutputParser\n",
|
||||
"from langchain_core.prompts import ChatPromptTemplate\n",
|
||||
"from langchain_openai import ChatOpenAI\n",
|
||||
"from langgraph.graph import END, START, StateGraph\n",
|
||||
"from typing_extensions import Annotated, TypedDict\n",
|
||||
"\n",
|
||||
"llm = ChatOpenAI(model=\"gpt-4o-mini\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class Critique(TypedDict):\n",
|
||||
" \"\"\"Generate a critique, if needed.\"\"\"\n",
|
||||
"\n",
|
||||
" critique_needed: Annotated[bool, ..., \"Whether or not a critique is needed.\"]\n",
|
||||
" critique: Annotated[str, ..., \"If needed, the critique.\"]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"critique_prompt = ChatPromptTemplate.from_template(\n",
|
||||
" \"Critique this response according to the critique request. \"\n",
|
||||
" \"If no critique is needed, specify that.\\n\\n\"\n",
|
||||
" \"Query: {query}\\n\\n\"\n",
|
||||
" \"Response: {response}\\n\\n\"\n",
|
||||
" \"Critique request: {critique_request}\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"revision_prompt = ChatPromptTemplate.from_template(\n",
|
||||
" \"Revise this response according to the critique and reivsion request.\\n\\n\"\n",
|
||||
" \"Query: {query}\\n\\n\"\n",
|
||||
" \"Response: {response}\\n\\n\"\n",
|
||||
" \"Critique request: {critique_request}\\n\\n\"\n",
|
||||
" \"Critique: {critique}\\n\\n\"\n",
|
||||
" \"If the critique does not identify anything worth changing, ignore the \"\n",
|
||||
" \"revision request and return 'No revisions needed'. If the critique \"\n",
|
||||
" \"does identify something worth changing, revise the response based on \"\n",
|
||||
" \"the revision request.\\n\\n\"\n",
|
||||
" \"Revision Request: {revision_request}\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"chain = llm | StrOutputParser()\n",
|
||||
"critique_chain = critique_prompt | llm.with_structured_output(Critique)\n",
|
||||
"revision_chain = revision_prompt | llm | StrOutputParser()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class State(TypedDict):\n",
|
||||
" query: str\n",
|
||||
" constitutional_principles: List[ConstitutionalPrinciple]\n",
|
||||
" initial_response: str\n",
|
||||
" critiques_and_revisions: List[Tuple[str, str]]\n",
|
||||
" response: str\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"async def generate_response(state: State):\n",
|
||||
" \"\"\"Generate initial response.\"\"\"\n",
|
||||
" response = await chain.ainvoke(state[\"query\"])\n",
|
||||
" return {\"response\": response, \"initial_response\": response}\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"async def critique_and_revise(state: State):\n",
|
||||
" \"\"\"Critique and revise response according to principles.\"\"\"\n",
|
||||
" critiques_and_revisions = []\n",
|
||||
" response = state[\"initial_response\"]\n",
|
||||
" for principle in state[\"constitutional_principles\"]:\n",
|
||||
" critique = await critique_chain.ainvoke(\n",
|
||||
" {\n",
|
||||
" \"query\": state[\"query\"],\n",
|
||||
" \"response\": response,\n",
|
||||
" \"critique_request\": principle.critique_request,\n",
|
||||
" }\n",
|
||||
" )\n",
|
||||
" if critique[\"critique_needed\"]:\n",
|
||||
" revision = await revision_chain.ainvoke(\n",
|
||||
" {\n",
|
||||
" \"query\": state[\"query\"],\n",
|
||||
" \"response\": response,\n",
|
||||
" \"critique_request\": principle.critique_request,\n",
|
||||
" \"critique\": critique[\"critique\"],\n",
|
||||
" \"revision_request\": principle.revision_request,\n",
|
||||
" }\n",
|
||||
" )\n",
|
||||
" response = revision\n",
|
||||
" critiques_and_revisions.append((critique[\"critique\"], revision))\n",
|
||||
" else:\n",
|
||||
" critiques_and_revisions.append((critique[\"critique\"], \"\"))\n",
|
||||
" return {\n",
|
||||
" \"critiques_and_revisions\": critiques_and_revisions,\n",
|
||||
" \"response\": response,\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"graph = StateGraph(State)\n",
|
||||
"graph.add_node(\"generate_response\", generate_response)\n",
|
||||
"graph.add_node(\"critique_and_revise\", critique_and_revise)\n",
|
||||
"\n",
|
||||
"graph.add_edge(START, \"generate_response\")\n",
|
||||
"graph.add_edge(\"generate_response\", \"critique_and_revise\")\n",
|
||||
"graph.add_edge(\"critique_and_revise\", END)\n",
|
||||
"app = graph.compile()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "01aac88d-464e-431f-b92e-746dcb743e1b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{}\n",
|
||||
"{'initial_response': 'Finding purpose, connection, and joy in our experiences and relationships.', 'response': 'Finding purpose, connection, and joy in our experiences and relationships.'}\n",
|
||||
"{'initial_response': 'Finding purpose, connection, and joy in our experiences and relationships.', 'critiques_and_revisions': [(\"The response exceeds the 10-word limit, providing a more elaborate answer than requested. A concise response, such as 'To seek purpose and joy in life,' would better align with the query.\", 'To seek purpose and joy in life.')], 'response': 'To seek purpose and joy in life.'}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"constitutional_principles = [\n",
|
||||
" ConstitutionalPrinciple(\n",
|
||||
" critique_request=\"Tell if this answer is good.\",\n",
|
||||
" revision_request=\"Give a better answer.\",\n",
|
||||
" )\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"query = \"What is the meaning of life? Answer in 10 words or fewer.\"\n",
|
||||
"\n",
|
||||
"async for step in app.astream(\n",
|
||||
" {\"query\": query, \"constitutional_principles\": constitutional_principles},\n",
|
||||
" stream_mode=\"values\",\n",
|
||||
"):\n",
|
||||
" subset = [\"initial_response\", \"critiques_and_revisions\", \"response\"]\n",
|
||||
" print({k: v for k, v in step.items() if k in subset})"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b2717810",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"</details>\n",
|
||||
"\n",
|
||||
"## Next steps\n",
|
||||
"\n",
|
||||
"See guides for generating structured output [here](/docs/how_to/structured_output/).\n",
|
||||
"\n",
|
||||
"Check out the [LangGraph documentation](https://langchain-ai.github.io/langgraph/) for detail on building with LangGraph."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -34,7 +34,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install --upgrade --quiet langchain-community langchain langchain-openai faiss-cpu"
|
||||
"%pip install --upgrade --quiet langchain-community langchain langchain-openai faiss-cpu beautifulsoup4"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -45,5 +45,7 @@ The below pages assist with migration from various specific chains to LCEL and L
|
||||
- [RefineDocumentsChain](/docs/versions/migrating_chains/refine_docs_chain)
|
||||
- [LLMRouterChain](/docs/versions/migrating_chains/llm_router_chain)
|
||||
- [MultiPromptChain](/docs/versions/migrating_chains/multi_prompt_chain)
|
||||
- [LLMMathChain](/docs/versions/migrating_chains/llm_math_chain)
|
||||
- [ConstitutionalChain](/docs/versions/migrating_chains/constitutional_chain)
|
||||
|
||||
Check out the [LCEL conceptual docs](/docs/concepts/#langchain-expression-language-lcel) and [LangGraph docs](https://langchain-ai.github.io/langgraph/) for more background information.
|
||||
281
docs/docs/versions/migrating_chains/llm_math_chain.ipynb
Normal file
281
docs/docs/versions/migrating_chains/llm_math_chain.ipynb
Normal file
File diff suppressed because one or more lines are too long
@@ -33,7 +33,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install --upgrade --quiet langchain-community langchain langchain-openai faiss-cpu"
|
||||
"%pip install --upgrade --quiet langchain-community langchain langchain-openai faiss-cpu beautifulsoup4"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -194,11 +194,6 @@ const config = {
|
||||
docId: "contributing/index",
|
||||
label: "Contributing",
|
||||
},
|
||||
{
|
||||
type: "docSidebar",
|
||||
sidebarId: "templates",
|
||||
label: "Templates",
|
||||
},
|
||||
{
|
||||
label: "Cookbooks",
|
||||
href: "https://github.com/langchain-ai/langchain/blob/master/cookbook/README.md"
|
||||
|
||||
@@ -522,8 +522,11 @@ LangChain implements the latest research in the field of Natural Language Proces
|
||||
This page contains `arXiv` papers referenced in the LangChain Documentation, API Reference,
|
||||
Templates, and Cookbooks.
|
||||
|
||||
From the opposite direction, scientists use LangChain in research and reference LangChain in the research papers.
|
||||
Here you find [such papers](https://arxiv.org/search/?query=langchain&searchtype=all&source=header).
|
||||
From the opposite direction, scientists use `LangChain` in research and reference it in the research papers.
|
||||
Here you find papers that reference:
|
||||
- [LangChain](https://arxiv.org/search/?query=langchain&searchtype=all&source=header)
|
||||
- [LangGraph](https://arxiv.org/search/?query=langgraph&searchtype=all&source=header)
|
||||
- [LangSmith](https://arxiv.org/search/?query=langsmith&searchtype=all&source=header)
|
||||
|
||||
## Summary
|
||||
|
||||
@@ -604,11 +607,9 @@ Here you find [such papers](https://arxiv.org/search/?query=langchain&searchtype
|
||||
f"""
|
||||
## {paper.title}
|
||||
|
||||
- **arXiv id:** {paper.arxiv_id}
|
||||
- **arXiv id:** [{paper.arxiv_id}]({paper.url}) **Published Date:** {paper.published_date}
|
||||
- **Title:** {paper.title}
|
||||
- **Authors:** {', '.join(paper.authors)}
|
||||
- **Published Date:** {paper.published_date}
|
||||
- **URL:** {paper.url}
|
||||
- **LangChain:**
|
||||
|
||||
{refs}
|
||||
|
||||
@@ -5,6 +5,10 @@ import shutil
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
deindexed_footer = """
|
||||
<span data-lc-docs-search-deindexed="true"></span>
|
||||
"""
|
||||
|
||||
if __name__ == "__main__":
|
||||
intermediate_dir = Path(sys.argv[1])
|
||||
|
||||
@@ -23,7 +27,7 @@ if __name__ == "__main__":
|
||||
# remove images
|
||||
content = re.sub(r"\!\[.*?\]\((.*?)\)", "", content)
|
||||
with open(full_destination, "w") as f:
|
||||
f.write(content)
|
||||
f.write(content + deindexed_footer)
|
||||
|
||||
sidebar_hidden = """---
|
||||
sidebar_class_name: hidden
|
||||
@@ -43,4 +47,4 @@ custom_edit_url:
|
||||
content = re.sub(r"\]\(\.\.\/", "](/docs/templates/", content)
|
||||
|
||||
with open(templates_index_intermediate, "w") as f:
|
||||
f.write(sidebar_hidden + content)
|
||||
f.write(sidebar_hidden + content + deindexed_footer)
|
||||
|
||||
@@ -217,6 +217,7 @@ The following table shows tools that can be used to automate tasks in databases:
|
||||
## All tools
|
||||
|
||||
import {{ IndexTable }} from "@theme/FeatureTables";
|
||||
|
||||
<IndexTable />
|
||||
|
||||
""" # noqa: E501
|
||||
|
||||
@@ -510,6 +510,55 @@ const FEATURE_TABLES = {
|
||||
source: "Uses AWS API to load PDFs",
|
||||
api: "API",
|
||||
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.AmazonTextractPDFLoader.html"
|
||||
},
|
||||
{
|
||||
name: "MathPix",
|
||||
link: "mathpix",
|
||||
source: "Uses MathPix to load PDFs",
|
||||
api: "Package",
|
||||
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.MathpixPDFLoader.html"
|
||||
},
|
||||
{
|
||||
name: "PDFPlumber",
|
||||
link: "pdfplumber",
|
||||
source: "Load PDF files using PDFPlumber",
|
||||
api: "Package",
|
||||
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PDFPlumberLoader.html"
|
||||
},
|
||||
{
|
||||
name: "PyPDFDirectory",
|
||||
link: "pypdfdirectory",
|
||||
source: "Load a directory with PDF files",
|
||||
api: "Package",
|
||||
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFDirectoryLoader.html"
|
||||
},
|
||||
{
|
||||
name: "PyPDFium2",
|
||||
link: "pypdfium2",
|
||||
source: "Load PDF files using PyPDFium2",
|
||||
api: "Package",
|
||||
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFium2Loader.html"
|
||||
},
|
||||
{
|
||||
name: "UnstructuredPDFLoader",
|
||||
link: "unstructured_pdfloader",
|
||||
source: "Load PDF files using Unstructured",
|
||||
api: "Package",
|
||||
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.UnstructuredPDFLoader.html"
|
||||
},
|
||||
{
|
||||
name: "PyMuPDF",
|
||||
link: "pymupdf",
|
||||
source: "Load PDF files using PyMuPDF",
|
||||
api: "Package",
|
||||
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyMuPDFLoader.html"
|
||||
},
|
||||
{
|
||||
name: "PDFMiner",
|
||||
link: "pdfminer",
|
||||
source: "Load PDF files using PDFMiner",
|
||||
api: "Package",
|
||||
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PDFMinerLoader.html"
|
||||
}
|
||||
]
|
||||
},
|
||||
@@ -530,7 +579,7 @@ const FEATURE_TABLES = {
|
||||
},
|
||||
{
|
||||
name: "DirectoryLoader",
|
||||
link: "document_loader_directory",
|
||||
link: "../../how_to/document_loader_directory",
|
||||
source: "All files in a given directory",
|
||||
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.directory.DirectoryLoader.html"
|
||||
},
|
||||
@@ -540,6 +589,24 @@ const FEATURE_TABLES = {
|
||||
source: "All file types",
|
||||
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_unstructured.document_loaders.UnstructuredLoader.html"
|
||||
},
|
||||
{
|
||||
name: "JSONLoader",
|
||||
link: "json",
|
||||
source: "JSON files",
|
||||
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.json_loader.JSONLoader.html"
|
||||
},
|
||||
{
|
||||
name: "UnstructuredMarkdownLoader",
|
||||
link: "unstructured_markdown",
|
||||
source: "Markdown files",
|
||||
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.markdown.UnstructuredMarkdownLoader.html"
|
||||
},
|
||||
{
|
||||
name: "BSHTMLLoader",
|
||||
link: "bshtml",
|
||||
source: "HTML files",
|
||||
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.html_bs.BSHTMLLoader.html"
|
||||
}
|
||||
]
|
||||
},
|
||||
vectorstores: {
|
||||
|
||||
@@ -101,6 +101,10 @@
|
||||
{
|
||||
"source": "/v0.2/docs/integrations/toolkits/xorbits/",
|
||||
"destination": "/v0.2/docs/integrations/tools#search"
|
||||
},
|
||||
{
|
||||
"source": "/v0.2/docs/integrations/chat/ollama_functions/",
|
||||
"destination": "https://python.langchain.com/v0.1/docs/integrations/chat/ollama_functions/"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
"""GitHub Toolkit."""
|
||||
"""GitLab Toolkit."""
|
||||
|
||||
from typing import Dict, List
|
||||
|
||||
|
||||
@@ -0,0 +1,8 @@
|
||||
"""Implement a GPT-3 driven browser.
|
||||
|
||||
Heavily influenced by https://github.com/nat/natbot
|
||||
"""
|
||||
|
||||
from langchain_community.chains.natbot.base import NatBotChain
|
||||
|
||||
__all__ = ["NatBotChain"]
|
||||
3
libs/community/langchain_community/chains/natbot/base.py
Normal file
3
libs/community/langchain_community/chains/natbot/base.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from langchain.chains import NatBotChain
|
||||
|
||||
__all__ = ["NatBotChain"]
|
||||
@@ -0,0 +1,7 @@
|
||||
from langchain.chains.natbot.crawler import (
|
||||
Crawler,
|
||||
ElementInViewPort,
|
||||
black_listed_elements,
|
||||
)
|
||||
|
||||
__all__ = ["ElementInViewPort", "Crawler", "black_listed_elements"]
|
||||
@@ -0,0 +1,3 @@
|
||||
from langchain.chains.natbot.prompt import PROMPT
|
||||
|
||||
__all__ = ["PROMPT"]
|
||||
@@ -16,9 +16,6 @@ class SingleFileFacebookMessengerChatLoader(BaseChatLoader):
|
||||
Args:
|
||||
path (Union[Path, str]): The path to the chat file.
|
||||
|
||||
Attributes:
|
||||
path (Path): The path to the chat file.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, path: Union[Path, str]) -> None:
|
||||
@@ -58,9 +55,6 @@ class FolderFacebookMessengerChatLoader(BaseChatLoader):
|
||||
path (Union[str, Path]): The path to the directory
|
||||
containing the chat files.
|
||||
|
||||
Attributes:
|
||||
path (Path): The path to the directory containing the chat files.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, path: Union[str, Path]) -> None:
|
||||
|
||||
@@ -42,6 +42,7 @@ from langchain_core.messages import (
|
||||
HumanMessageChunk,
|
||||
SystemMessage,
|
||||
SystemMessageChunk,
|
||||
ToolMessage,
|
||||
)
|
||||
from langchain_core.output_parsers.base import OutputParserLike
|
||||
from langchain_core.output_parsers.openai_tools import (
|
||||
@@ -150,6 +151,15 @@ def _convert_dict_to_message(dct: Dict[str, Any]) -> BaseMessage:
|
||||
if tool_calls is not None:
|
||||
additional_kwargs["tool_calls"] = tool_calls
|
||||
return AIMessage(content=content, additional_kwargs=additional_kwargs)
|
||||
if role == "tool":
|
||||
additional_kwargs = {}
|
||||
if "name" in dct:
|
||||
additional_kwargs["name"] = dct["name"]
|
||||
return ToolMessage(
|
||||
content=content,
|
||||
tool_call_id=dct.get("tool_call_id"), # type: ignore[arg-type]
|
||||
additional_kwargs=additional_kwargs,
|
||||
)
|
||||
return ChatMessage(role=role, content=content) # type: ignore[arg-type]
|
||||
|
||||
|
||||
@@ -171,6 +181,13 @@ def _convert_message_to_dict(message: BaseMessage) -> Dict[str, Any]:
|
||||
message_dict = {"role": "user", "content": message.content}
|
||||
elif isinstance(message, AIMessage):
|
||||
message_dict = {"role": "assistant", "content": message.content}
|
||||
elif isinstance(message, ToolMessage):
|
||||
message_dict = {
|
||||
"role": "tool",
|
||||
"content": message.content,
|
||||
"tool_call_id": message.tool_call_id,
|
||||
"name": message.name or message.additional_kwargs.get("name"),
|
||||
}
|
||||
else:
|
||||
raise TypeError(f"Got unknown type '{message.__class__.__name__}'.")
|
||||
return message_dict
|
||||
|
||||
@@ -10,7 +10,74 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BSHTMLLoader(BaseLoader):
|
||||
"""Load `HTML` files and parse them with `beautiful soup`."""
|
||||
"""
|
||||
BSHTMLLoader document loader integration
|
||||
|
||||
Setup:
|
||||
Install ``langchain-community`` and ``bs4``.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip install -U langchain-community bs4
|
||||
|
||||
Instantiate:
|
||||
.. code-block:: python
|
||||
|
||||
from langchain_community.document_loaders import BSHTMLLoader
|
||||
|
||||
loader = BSHTMLLoader(
|
||||
file_path="./example_data/fake-content.html",
|
||||
)
|
||||
|
||||
Lazy load:
|
||||
.. code-block:: python
|
||||
|
||||
docs = []
|
||||
docs_lazy = loader.lazy_load()
|
||||
|
||||
# async variant:
|
||||
# docs_lazy = await loader.alazy_load()
|
||||
|
||||
for doc in docs_lazy:
|
||||
docs.append(doc)
|
||||
print(docs[0].page_content[:100])
|
||||
print(docs[0].metadata)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
|
||||
Test Title
|
||||
|
||||
|
||||
My First Heading
|
||||
My first paragraph.
|
||||
|
||||
|
||||
|
||||
{'source': './example_data/fake-content.html', 'title': 'Test Title'}
|
||||
|
||||
Async load:
|
||||
.. code-block:: python
|
||||
|
||||
docs = await loader.aload()
|
||||
print(docs[0].page_content[:100])
|
||||
print(docs[0].metadata)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
|
||||
|
||||
Test Title
|
||||
|
||||
|
||||
My First Heading
|
||||
My first paragraph.
|
||||
|
||||
|
||||
|
||||
{'source': './example_data/fake-content.html', 'title': 'Test Title'}
|
||||
|
||||
""" # noqa: E501
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -13,19 +13,60 @@ class UnstructuredMarkdownLoader(UnstructuredFileLoader):
|
||||
You can pass in additional unstructured kwargs after mode to apply
|
||||
different unstructured settings.
|
||||
|
||||
Examples
|
||||
--------
|
||||
from langchain_community.document_loaders import UnstructuredMarkdownLoader
|
||||
Setup:
|
||||
Install ``langchain-community``.
|
||||
|
||||
loader = UnstructuredMarkdownLoader(
|
||||
"example.md", mode="elements", strategy="fast",
|
||||
)
|
||||
docs = loader.load()
|
||||
.. code-block:: bash
|
||||
|
||||
pip install -U langchain-community
|
||||
|
||||
Instantiate:
|
||||
.. code-block:: python
|
||||
|
||||
from langchain_community.document_loaders import UnstructuredMarkdownLoader
|
||||
|
||||
loader = UnstructuredMarkdownLoader(
|
||||
"./example_data/example.md",
|
||||
mode="elements",
|
||||
strategy="fast",
|
||||
)
|
||||
|
||||
Lazy load:
|
||||
.. code-block:: python
|
||||
|
||||
docs = []
|
||||
docs_lazy = loader.lazy_load()
|
||||
|
||||
# async variant:
|
||||
# docs_lazy = await loader.alazy_load()
|
||||
|
||||
for doc in docs_lazy:
|
||||
docs.append(doc)
|
||||
print(docs[0].page_content[:100])
|
||||
print(docs[0].metadata)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
Sample Markdown Document
|
||||
{'source': './example_data/example.md', 'category_depth': 0, 'last_modified': '2024-08-14T15:04:18', 'languages': ['eng'], 'filetype': 'text/markdown', 'file_directory': './example_data', 'filename': 'example.md', 'category': 'Title', 'element_id': '3d0b313864598e704aa26c728ecb61e5'}
|
||||
|
||||
|
||||
Async load:
|
||||
.. code-block:: python
|
||||
|
||||
docs = await loader.aload()
|
||||
print(docs[0].page_content[:100])
|
||||
print(docs[0].metadata)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
Sample Markdown Document
|
||||
{'source': './example_data/example.md', 'category_depth': 0, 'last_modified': '2024-08-14T15:04:18', 'languages': ['eng'], 'filetype': 'text/markdown', 'file_directory': './example_data', 'filename': 'example.md', 'category': 'Title', 'element_id': '3d0b313864598e704aa26c728ecb61e5'}
|
||||
|
||||
References
|
||||
----------
|
||||
https://unstructured-io.github.io/unstructured/core/partition.html#partition-md
|
||||
"""
|
||||
""" # noqa: E501
|
||||
|
||||
def _get_elements(self) -> List:
|
||||
from unstructured.__version__ import __version__ as __unstructured_version__
|
||||
|
||||
@@ -129,7 +129,9 @@ class OpenAIWhisperParser(BaseBlobParser):
|
||||
continue
|
||||
|
||||
yield Document(
|
||||
page_content=transcript.text,
|
||||
page_content=transcript.text
|
||||
if not isinstance(transcript, str)
|
||||
else transcript,
|
||||
metadata={"source": blob.source, "chunk": split_number},
|
||||
)
|
||||
|
||||
|
||||
@@ -23,14 +23,14 @@ class MimeTypeBasedParser(BaseBlobParser):
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from langchain_community.document_loaders.parsers.generic import MimeTypeBasedParser
|
||||
from langchain_community.document_loaders.parsers.generic import MimeTypeBasedParser
|
||||
|
||||
parser = MimeTypeBasedParser(
|
||||
handlers={
|
||||
"application/pdf": ...,
|
||||
},
|
||||
fallback_parser=...,
|
||||
)
|
||||
parser = MimeTypeBasedParser(
|
||||
handlers={
|
||||
"application/pdf": ...,
|
||||
},
|
||||
fallback_parser=...,
|
||||
)
|
||||
""" # noqa: E501
|
||||
|
||||
def __init__(
|
||||
|
||||
@@ -9,6 +9,7 @@ from typing import (
|
||||
Type,
|
||||
)
|
||||
|
||||
from langchain_core._api import beta
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.graph_vectorstores.base import (
|
||||
@@ -23,6 +24,7 @@ if TYPE_CHECKING:
|
||||
from cassandra.cluster import Session
|
||||
|
||||
|
||||
@beta()
|
||||
class CassandraGraphVectorStore(GraphVectorStore):
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
from typing import Any, Dict, Iterable, List, Optional, Set, Union
|
||||
|
||||
from langchain_core._api import beta
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.graph_vectorstores.links import Link
|
||||
|
||||
@@ -11,6 +12,7 @@ from langchain_community.graph_vectorstores.extractors.link_extractor import (
|
||||
GLiNERInput = Union[str, Document]
|
||||
|
||||
|
||||
@beta()
|
||||
class GLiNERLinkExtractor(LinkExtractor[GLiNERInput]):
|
||||
"""Link documents with common named entities using GLiNER <https://github.com/urchade/GLiNER>."""
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
from typing import Callable, List, Set
|
||||
|
||||
from langchain_core._api import beta
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.graph_vectorstores.links import Link
|
||||
|
||||
@@ -18,6 +19,7 @@ _CHILD: str = "c:"
|
||||
_SIBLING: str = "s:"
|
||||
|
||||
|
||||
@beta()
|
||||
class HierarchyLinkExtractor(LinkExtractor[HierarchyInput]):
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -4,6 +4,7 @@ from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING, List, Optional, Set, Union
|
||||
from urllib.parse import urldefrag, urljoin, urlparse
|
||||
|
||||
from langchain_core._api import beta
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.graph_vectorstores import Link
|
||||
|
||||
@@ -61,6 +62,7 @@ class HtmlInput:
|
||||
base_url: str
|
||||
|
||||
|
||||
@beta()
|
||||
class HtmlLinkExtractor(LinkExtractor[HtmlInput]):
|
||||
def __init__(self, *, kind: str = "hyperlink", drop_fragments: bool = True):
|
||||
"""Extract hyperlinks from HTML content.
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
from typing import Any, Dict, Iterable, Optional, Set, Union
|
||||
|
||||
from langchain_core._api import beta
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.graph_vectorstores.links import Link
|
||||
|
||||
@@ -10,6 +11,7 @@ from langchain_community.graph_vectorstores.extractors.link_extractor import (
|
||||
KeybertInput = Union[str, Document]
|
||||
|
||||
|
||||
@beta()
|
||||
class KeybertLinkExtractor(LinkExtractor[KeybertInput]):
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -3,6 +3,7 @@ from __future__ import annotations
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Generic, Iterable, Set, TypeVar
|
||||
|
||||
from langchain_core._api import beta
|
||||
from langchain_core.graph_vectorstores import Link
|
||||
|
||||
InputT = TypeVar("InputT")
|
||||
@@ -10,6 +11,7 @@ InputT = TypeVar("InputT")
|
||||
METADATA_LINKS_KEY = "links"
|
||||
|
||||
|
||||
@beta()
|
||||
class LinkExtractor(ABC, Generic[InputT]):
|
||||
"""Interface for extracting links (incoming, outgoing, bidirectional)."""
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
from typing import Callable, Iterable, Set, TypeVar
|
||||
|
||||
from langchain_core._api import beta
|
||||
from langchain_core.graph_vectorstores import Link
|
||||
|
||||
from langchain_community.graph_vectorstores.extractors.link_extractor import (
|
||||
@@ -10,6 +11,7 @@ InputT = TypeVar("InputT")
|
||||
UnderlyingInputT = TypeVar("UnderlyingInputT")
|
||||
|
||||
|
||||
@beta()
|
||||
class LinkExtractorAdapter(LinkExtractor[InputT]):
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
from typing import Any, Sequence
|
||||
|
||||
from langchain_core._api import beta
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.documents.transformers import BaseDocumentTransformer
|
||||
from langchain_core.graph_vectorstores.links import copy_with_links
|
||||
@@ -9,6 +10,7 @@ from langchain_community.graph_vectorstores.extractors.link_extractor import (
|
||||
)
|
||||
|
||||
|
||||
@beta()
|
||||
class LinkExtractorTransformer(BaseDocumentTransformer):
|
||||
"""DocumentTransformer for applying one or more LinkExtractors.
|
||||
|
||||
|
||||
@@ -69,30 +69,26 @@ class QianfanLLMEndpoint(LLM):
|
||||
Invoke:
|
||||
.. code-block:: python
|
||||
|
||||
messages = [
|
||||
("system", "你是一名专业的翻译家,可以将用户的中文翻译为英文。"),
|
||||
("human", "我喜欢编程。"),
|
||||
]
|
||||
llm.invoke(messages)
|
||||
input_text = "用50个字左右阐述,生命的意义在于"
|
||||
llm.invoke(input_text)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
'I like programming.'
|
||||
'生命的意义在于体验、成长、爱与被爱、贡献与传承,以及对未知的勇敢探索与自我超越。'
|
||||
|
||||
Stream:
|
||||
.. code-block:: python
|
||||
|
||||
for chunk in llm.stream(messages):
|
||||
for chunk in llm.stream(input_text):
|
||||
print(chunk)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
I like
|
||||
programming.
|
||||
生命的意义 | 在于不断探索 | 与成长 | ,实现 | 自我价值,| 给予爱 | 并接受 | 爱, | 在经历 | 中感悟 | ,让 | 短暂的存在 | 绽放出无限 | 的光彩 | 与温暖 | 。
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
stream = llm.stream(messages)
|
||||
stream = llm.stream(input_text)
|
||||
full = next(stream)
|
||||
for chunk in stream:
|
||||
full += chunk
|
||||
@@ -100,23 +96,23 @@ class QianfanLLMEndpoint(LLM):
|
||||
|
||||
.. code-block::
|
||||
|
||||
'I like programming.'
|
||||
'生命的意义在于探索、成长、爱与被爱、贡献价值、体验世界之美,以及在有限的时间里追求内心的平和与幸福。'
|
||||
|
||||
Async:
|
||||
.. code-block:: python
|
||||
|
||||
await llm.ainvoke(messages)
|
||||
await llm.ainvoke(input_text)
|
||||
|
||||
# stream:
|
||||
# async for chunk in llm.astream(messages):
|
||||
# async for chunk in llm.astream(input_text):
|
||||
# print(chunk)
|
||||
|
||||
# batch:
|
||||
# await llm.abatch([messages])
|
||||
# await llm.abatch([input_text])
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
'I like programming.'
|
||||
'生命的意义在于探索、成长、爱与被爱、贡献社会,在有限的时间里追寻无限的可能,实现自我价值,让生活充满色彩与意义。'
|
||||
|
||||
""" # noqa: E501
|
||||
|
||||
|
||||
@@ -199,44 +199,38 @@ class Tongyi(BaseLLM):
|
||||
Invoke:
|
||||
.. code-block:: python
|
||||
|
||||
messages = [
|
||||
("system", "你是一名专业的翻译家,可以将用户的中文翻译为英文。"),
|
||||
("human", "我喜欢编程。"),
|
||||
]
|
||||
llm.invoke(messages)
|
||||
input_text = "用50个字左右阐述,生命的意义在于"
|
||||
llm.invoke(input_text)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
'I enjoy programming.'
|
||||
'探索、成长、连接与爱——在有限的时间里,不断学习、体验、贡献并寻找与世界和谐共存之道,让每一刻充满价值与意义。'
|
||||
|
||||
Stream:
|
||||
.. code-block:: python
|
||||
|
||||
for chunk in llm.stream(messages):
|
||||
for chunk in llm.stream(input_text):
|
||||
print(chunk)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
I
|
||||
enjoy
|
||||
programming
|
||||
.
|
||||
探索 | 、 | 成长 | 、连接与爱。 | 在有限的时间里,寻找个人价值, | 贡献于他人,共同体验世界的美好 | ,让世界因自己的存在而更 | 温暖。
|
||||
|
||||
Async:
|
||||
.. code-block:: python
|
||||
|
||||
await llm.ainvoke(messages)
|
||||
await llm.ainvoke(input_text)
|
||||
|
||||
# stream:
|
||||
# async for chunk in llm.astream(messages):
|
||||
# async for chunk in llm.astream(input_text):
|
||||
# print(chunk)
|
||||
|
||||
# batch:
|
||||
# await llm.abatch([messages])
|
||||
# await llm.abatch([input_text])
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
'I enjoy programming.'
|
||||
'探索、成长、连接与爱。在有限的时间里,寻找个人价值,贡献于他人和社会,体验丰富多彩的情感与经历,不断学习进步,让世界因自己的存在而更美好。'
|
||||
|
||||
""" # noqa: E501
|
||||
|
||||
|
||||
@@ -1735,7 +1735,11 @@ def _reorder_results_with_maximal_marginal_relevance(
|
||||
def _result_to_document(result: Dict) -> Document:
|
||||
return Document(
|
||||
page_content=result.pop(FIELDS_CONTENT),
|
||||
metadata=json.loads(result[FIELDS_METADATA])
|
||||
metadata=(
|
||||
result[FIELDS_METADATA]
|
||||
if isinstance(result[FIELDS_METADATA], dict)
|
||||
else json.loads(result[FIELDS_METADATA])
|
||||
)
|
||||
if FIELDS_METADATA in result
|
||||
else {
|
||||
key: value for key, value in result.items() if key != FIELDS_CONTENT_VECTOR
|
||||
|
||||
@@ -6,14 +6,6 @@ from langchain_core.documents import Document
|
||||
from langchain_community.chat_models import ChatOpenAI
|
||||
|
||||
|
||||
def test_llm_construction_with_kwargs() -> None:
|
||||
llm_chain_kwargs = {"verbose": True}
|
||||
compressor = LLMChainExtractor.from_llm(
|
||||
ChatOpenAI(), llm_chain_kwargs=llm_chain_kwargs
|
||||
)
|
||||
assert compressor.llm_chain.verbose is True
|
||||
|
||||
|
||||
def test_llm_chain_extractor() -> None:
|
||||
texts = [
|
||||
"The Roman Empire followed the Roman Republic.",
|
||||
|
||||
@@ -25,7 +25,7 @@ model_name = os.getenv("OPENAI_EMBEDDINGS_MODEL_NAME", "text-embedding-ada-002")
|
||||
INDEX_NAME = "langchain-test-index"
|
||||
INDEX_NAME_VECTOR_HNSW = "langchain-test-index-hnsw"
|
||||
NAMESPACE = "langchain_test_db.langchain_test_collection"
|
||||
CONNECTION_STRING: str = "mongodb+srv://akataria:Basket24ball@akataria-vector-search-testing.mongocluster.cosmos.azure.com/?tls=true&authMechanism=SCRAM-SHA-256&retrywrites=false&maxIdleTimeMS=120000"
|
||||
CONNECTION_STRING: str = os.environ.get("MONGODB_VCORE_URI", "")
|
||||
DB_NAME, COLLECTION_NAME = NAMESPACE.split(".")
|
||||
|
||||
num_lists = 3
|
||||
|
||||
@@ -2,11 +2,10 @@
|
||||
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from langchain.chains.natbot.base import NatBotChain
|
||||
from langchain_core.callbacks.manager import CallbackManagerForLLMRun
|
||||
from langchain_core.language_models.llms import LLM
|
||||
|
||||
from langchain.chains.natbot.base import NatBotChain
|
||||
|
||||
|
||||
class FakeLLM(LLM):
|
||||
"""Fake LLM wrapper for testing purposes."""
|
||||
@@ -10,7 +10,7 @@ def test_standard_params() -> None:
|
||||
class ExpectedParams(BaseModel):
|
||||
ls_provider: str
|
||||
ls_model_name: str
|
||||
ls_model_type: Literal["chat"]
|
||||
ls_model_type: Literal["chat", "llm"]
|
||||
ls_temperature: Optional[float]
|
||||
ls_max_tokens: Optional[int]
|
||||
ls_stop: Optional[List[str]]
|
||||
|
||||
@@ -1,8 +1,12 @@
|
||||
"""Test ZhipuAI Chat API wrapper"""
|
||||
|
||||
import pytest
|
||||
from langchain_core.messages import ToolMessage
|
||||
|
||||
from langchain_community.chat_models.zhipuai import ChatZhipuAI
|
||||
from langchain_community.chat_models.zhipuai import (
|
||||
ChatZhipuAI,
|
||||
_convert_message_to_dict,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.requires("httpx", "httpx_sse", "jwt")
|
||||
@@ -11,3 +15,15 @@ def test_zhipuai_model_param() -> None:
|
||||
assert llm.model_name == "foo"
|
||||
llm = ChatZhipuAI(api_key="test", model_name="foo") # type: ignore[call-arg]
|
||||
assert llm.model_name == "foo"
|
||||
|
||||
|
||||
def test__convert_message_to_dict_with_tool() -> None:
|
||||
message = ToolMessage(name="foo", content="bar", tool_call_id="abc123")
|
||||
result = _convert_message_to_dict(message)
|
||||
expected_output = {
|
||||
"name": "foo",
|
||||
"content": "bar",
|
||||
"tool_call_id": "abc123",
|
||||
"role": "tool",
|
||||
}
|
||||
assert result == expected_output
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, TypeVar, Union
|
||||
from uuid import UUID
|
||||
|
||||
@@ -13,6 +14,8 @@ if TYPE_CHECKING:
|
||||
from langchain_core.messages import BaseMessage
|
||||
from langchain_core.outputs import ChatGenerationChunk, GenerationChunk, LLMResult
|
||||
|
||||
_LOGGER = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RetrieverManagerMixin:
|
||||
"""Mixin for Retriever callbacks."""
|
||||
@@ -911,15 +914,72 @@ class BaseCallbackManager(CallbackManagerMixin):
|
||||
def copy(self: T) -> T:
|
||||
"""Copy the callback manager."""
|
||||
return self.__class__(
|
||||
handlers=self.handlers,
|
||||
inheritable_handlers=self.inheritable_handlers,
|
||||
handlers=self.handlers.copy(),
|
||||
inheritable_handlers=self.inheritable_handlers.copy(),
|
||||
parent_run_id=self.parent_run_id,
|
||||
tags=self.tags,
|
||||
inheritable_tags=self.inheritable_tags,
|
||||
metadata=self.metadata,
|
||||
inheritable_metadata=self.inheritable_metadata,
|
||||
tags=self.tags.copy(),
|
||||
inheritable_tags=self.inheritable_tags.copy(),
|
||||
metadata=self.metadata.copy(),
|
||||
inheritable_metadata=self.inheritable_metadata.copy(),
|
||||
)
|
||||
|
||||
def merge(self: T, other: BaseCallbackManager) -> T:
|
||||
"""Merge the callback manager with another callback manager.
|
||||
|
||||
May be overridden in subclasses. Primarily used internally
|
||||
within merge_configs.
|
||||
|
||||
Returns:
|
||||
BaseCallbackManager: The merged callback manager of the same type
|
||||
as the current object.
|
||||
|
||||
Example: Merging two callback managers.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from langchain_core.callbacks.manager import CallbackManager, trace_as_chain_group
|
||||
from langchain_core.callbacks.stdout import StdOutCallbackHandler
|
||||
|
||||
manager = CallbackManager(handlers=[StdOutCallbackHandler()], tags=["tag2"])
|
||||
with trace_as_chain_group("My Group Name", tags=["tag1"]) as group_manager:
|
||||
merged_manager = group_manager.merge(manager)
|
||||
print(merged_manager.handlers)
|
||||
# [
|
||||
# <langchain_core.callbacks.stdout.StdOutCallbackHandler object at ...>,
|
||||
# <langchain_core.callbacks.streaming_stdout.StreamingStdOutCallbackHandler object at ...>,
|
||||
# ]
|
||||
|
||||
print(merged_manager.tags)
|
||||
# ['tag2', 'tag1'] (order may vary: tags are merged through a set)
|
||||
|
||||
""" # noqa: E501
|
||||
if self.parent_run_id != other.parent_run_id:
|
||||
_LOGGER.warning(
|
||||
f"{self.__class__.__name__}.merge(): Parent run IDs do not match."
|
||||
" Using the parent run ID of the first callback manager."
|
||||
)
|
||||
manager = self.__class__(
|
||||
parent_run_id=self.parent_run_id or other.parent_run_id,
|
||||
handlers=[],
|
||||
inheritable_handlers=[],
|
||||
tags=list(set(self.tags + other.tags)),
|
||||
inheritable_tags=list(set(self.inheritable_tags + other.inheritable_tags)),
|
||||
metadata={
|
||||
**self.metadata,
|
||||
**other.metadata,
|
||||
},
|
||||
)
|
||||
|
||||
handlers = self.handlers + other.handlers
|
||||
inheritable_handlers = self.inheritable_handlers + other.inheritable_handlers
|
||||
|
||||
for handler in handlers:
|
||||
manager.add_handler(handler)
|
||||
|
||||
for handler in inheritable_handlers:
|
||||
manager.add_handler(handler, inherit=True)
|
||||
return manager
|
||||
|
||||
@property
|
||||
def is_async(self) -> bool:
|
||||
"""Whether the callback manager is async."""
|
||||
|
||||
@@ -1612,16 +1612,80 @@ class CallbackManagerForChainGroup(CallbackManager):
|
||||
def copy(self) -> CallbackManagerForChainGroup:
|
||||
"""Copy the callback manager."""
|
||||
return self.__class__(
|
||||
handlers=self.handlers,
|
||||
inheritable_handlers=self.inheritable_handlers,
|
||||
handlers=self.handlers.copy(),
|
||||
inheritable_handlers=self.inheritable_handlers.copy(),
|
||||
parent_run_id=self.parent_run_id,
|
||||
tags=self.tags,
|
||||
inheritable_tags=self.inheritable_tags,
|
||||
metadata=self.metadata,
|
||||
inheritable_metadata=self.inheritable_metadata,
|
||||
tags=self.tags.copy(),
|
||||
inheritable_tags=self.inheritable_tags.copy(),
|
||||
metadata=self.metadata.copy(),
|
||||
inheritable_metadata=self.inheritable_metadata.copy(),
|
||||
parent_run_manager=self.parent_run_manager,
|
||||
)
|
||||
|
||||
def merge(
|
||||
self: CallbackManagerForChainGroup, other: BaseCallbackManager
|
||||
) -> CallbackManagerForChainGroup:
|
||||
"""Merge the group callback manager with another callback manager.
|
||||
|
||||
Overrides the merge method in the base class to ensure that the
|
||||
parent run manager is preserved. Keeps the parent_run_manager
|
||||
from the current object.
|
||||
|
||||
Returns:
|
||||
CallbackManagerForChainGroup: A copy of the current object with the
|
||||
handlers, tags, and other attributes merged from the other object.
|
||||
|
||||
Example: Merging two callback managers.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from langchain_core.callbacks.manager import CallbackManager, trace_as_chain_group
|
||||
from langchain_core.callbacks.stdout import StdOutCallbackHandler
|
||||
|
||||
manager = CallbackManager(handlers=[StdOutCallbackHandler()], tags=["tag2"])
|
||||
with trace_as_chain_group("My Group Name", tags=["tag1"]) as group_manager:
|
||||
merged_manager = group_manager.merge(manager)
|
||||
print(type(merged_manager))
|
||||
# <class 'langchain_core.callbacks.manager.CallbackManagerForChainGroup'>
|
||||
|
||||
print(merged_manager.handlers)
|
||||
# [
|
||||
# <langchain_core.tracers.langchain.LangChainTracer object at ...>,
|
||||
# <langchain_core.callbacks.stdout.StdOutCallbackHandler object at ...>,
|
||||
# ]
|
||||
|
||||
print(merged_manager.tags)
|
||||
# ['tag2', 'tag1'] (order may vary: tags are merged through a set)
|
||||
|
||||
""" # noqa: E501
|
||||
if self.parent_run_id != other.parent_run_id:
|
||||
logger.warning(
|
||||
f"{self.__class__.__name__}.merge(): Parent run IDs do not match."
|
||||
" Using the parent run ID of the first callback manager."
|
||||
)
|
||||
manager = self.__class__(
|
||||
parent_run_id=self.parent_run_id or other.parent_run_id,
|
||||
handlers=[],
|
||||
inheritable_handlers=[],
|
||||
tags=list(set(self.tags + other.tags)),
|
||||
inheritable_tags=list(set(self.inheritable_tags + other.inheritable_tags)),
|
||||
metadata={
|
||||
**self.metadata,
|
||||
**other.metadata,
|
||||
},
|
||||
parent_run_manager=self.parent_run_manager,
|
||||
)
|
||||
|
||||
handlers = self.handlers + other.handlers
|
||||
inheritable_handlers = self.inheritable_handlers + other.inheritable_handlers
|
||||
|
||||
for handler in handlers:
|
||||
manager.add_handler(handler)
|
||||
|
||||
for handler in inheritable_handlers:
|
||||
manager.add_handler(handler, inherit=True)
|
||||
return manager
|
||||
|
||||
def on_chain_end(self, outputs: Union[Dict[str, Any], Any], **kwargs: Any) -> None:
|
||||
"""Run when traced chain group ends.
|
||||
|
||||
@@ -2040,16 +2104,80 @@ class AsyncCallbackManagerForChainGroup(AsyncCallbackManager):
|
||||
def copy(self) -> AsyncCallbackManagerForChainGroup:
|
||||
"""Copy the async callback manager."""
|
||||
return self.__class__(
|
||||
handlers=self.handlers,
|
||||
inheritable_handlers=self.inheritable_handlers,
|
||||
handlers=self.handlers.copy(),
|
||||
inheritable_handlers=self.inheritable_handlers.copy(),
|
||||
parent_run_id=self.parent_run_id,
|
||||
tags=self.tags,
|
||||
inheritable_tags=self.inheritable_tags,
|
||||
metadata=self.metadata,
|
||||
inheritable_metadata=self.inheritable_metadata,
|
||||
tags=self.tags.copy(),
|
||||
inheritable_tags=self.inheritable_tags.copy(),
|
||||
metadata=self.metadata.copy(),
|
||||
inheritable_metadata=self.inheritable_metadata.copy(),
|
||||
parent_run_manager=self.parent_run_manager,
|
||||
)
|
||||
|
||||
def merge(
|
||||
self: AsyncCallbackManagerForChainGroup, other: BaseCallbackManager
|
||||
) -> AsyncCallbackManagerForChainGroup:
|
||||
"""Merge the group callback manager with another callback manager.
|
||||
|
||||
Overrides the merge method in the base class to ensure that the
|
||||
parent run manager is preserved. Keeps the parent_run_manager
|
||||
from the current object.
|
||||
|
||||
Returns:
|
||||
AsyncCallbackManagerForChainGroup: A copy of the current AsyncCallbackManagerForChainGroup
|
||||
with the handlers, tags, etc. of the other callback manager merged in.
|
||||
|
||||
Example: Merging two callback managers.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from langchain_core.callbacks.manager import CallbackManager, atrace_as_chain_group
|
||||
from langchain_core.callbacks.stdout import StdOutCallbackHandler
|
||||
|
||||
manager = CallbackManager(handlers=[StdOutCallbackHandler()], tags=["tag2"])
|
||||
async with atrace_as_chain_group("My Group Name", tags=["tag1"]) as group_manager:
|
||||
merged_manager = group_manager.merge(manager)
|
||||
print(type(merged_manager))
|
||||
# <class 'langchain_core.callbacks.manager.AsyncCallbackManagerForChainGroup'>
|
||||
|
||||
print(merged_manager.handlers)
|
||||
# [
|
||||
# <langchain_core.tracers.langchain.LangChainTracer object at ...>,
|
||||
# <langchain_core.callbacks.stdout.StdOutCallbackHandler object at ...>,
|
||||
# ]
|
||||
|
||||
print(merged_manager.tags)
|
||||
# ['tag2', 'tag1'] (order may vary: tags are merged through a set)
|
||||
|
||||
""" # noqa: E501
|
||||
if self.parent_run_id != other.parent_run_id:
|
||||
logger.warning(
|
||||
f"{self.__class__.__name__}.merge(): Parent run IDs do not match."
|
||||
" Using the parent run ID of the first callback manager."
|
||||
)
|
||||
manager = self.__class__(
|
||||
parent_run_id=self.parent_run_id or other.parent_run_id,
|
||||
handlers=[],
|
||||
inheritable_handlers=[],
|
||||
tags=list(set(self.tags + other.tags)),
|
||||
inheritable_tags=list(set(self.inheritable_tags + other.inheritable_tags)),
|
||||
metadata={
|
||||
**self.metadata,
|
||||
**other.metadata,
|
||||
},
|
||||
parent_run_manager=self.parent_run_manager,
|
||||
)
|
||||
|
||||
handlers = self.handlers + other.handlers
|
||||
inheritable_handlers = self.inheritable_handlers + other.inheritable_handlers
|
||||
|
||||
for handler in handlers:
|
||||
manager.add_handler(handler)
|
||||
|
||||
for handler in inheritable_handlers:
|
||||
manager.add_handler(handler, inherit=True)
|
||||
return manager
|
||||
|
||||
async def on_chain_end(
|
||||
self, outputs: Union[Dict[str, Any], Any], **kwargs: Any
|
||||
) -> None:
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
from langchain_core.document_loaders.base import BaseBlobParser, BaseLoader
|
||||
from langchain_core.document_loaders.blob_loaders import Blob, BlobLoader, PathLike
|
||||
from langchain_core.document_loaders.langsmith import LangSmithLoader
|
||||
|
||||
__all__ = [
|
||||
"BaseBlobParser",
|
||||
@@ -7,4 +8,5 @@ __all__ = [
|
||||
"Blob",
|
||||
"BlobLoader",
|
||||
"PathLike",
|
||||
"LangSmithLoader",
|
||||
]
|
||||
|
||||
128
libs/core/langchain_core/document_loaders/langsmith.py
Normal file
128
libs/core/langchain_core/document_loaders/langsmith.py
Normal file
@@ -0,0 +1,128 @@
|
||||
import datetime
|
||||
import json
|
||||
import uuid
|
||||
from typing import Any, Callable, Iterator, Optional, Sequence, Union
|
||||
|
||||
from langsmith import Client as LangSmithClient
|
||||
|
||||
from langchain_core.document_loaders.base import BaseLoader
|
||||
from langchain_core.documents import Document
|
||||
|
||||
|
||||
class LangSmithLoader(BaseLoader):
|
||||
"""Load LangSmith Dataset examples as Documents.
|
||||
|
||||
Loads the example inputs as the Document page content and places the entire example
|
||||
into the Document metadata. This allows you to easily create few-shot example
|
||||
retrievers from the loaded documents.
|
||||
|
||||
.. dropdown:: Lazy load
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from langchain_core.document_loaders import LangSmithLoader
|
||||
|
||||
loader = LangSmithLoader(dataset_id="...", limit=100)
|
||||
docs = []
|
||||
for doc in loader.lazy_load():
|
||||
docs.append(doc)
|
||||
|
||||
.. code-block:: pycon
|
||||
|
||||
# -> [Document("...", metadata={"inputs": {...}, "outputs": {...}, ...}), ...]
|
||||
|
||||
.. versionadded:: 0.2.34
|
||||
""" # noqa: E501
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
dataset_id: Optional[Union[uuid.UUID, str]] = None,
|
||||
dataset_name: Optional[str] = None,
|
||||
example_ids: Optional[Sequence[Union[uuid.UUID, str]]] = None,
|
||||
as_of: Optional[Union[datetime.datetime, str]] = None,
|
||||
splits: Optional[Sequence[str]] = None,
|
||||
inline_s3_urls: bool = True,
|
||||
offset: int = 0,
|
||||
limit: Optional[int] = None,
|
||||
metadata: Optional[dict] = None,
|
||||
filter: Optional[str] = None,
|
||||
content_key: str = "",
|
||||
format_content: Optional[Callable[..., str]] = None,
|
||||
client: Optional[LangSmithClient] = None,
|
||||
**client_kwargs: Any,
|
||||
) -> None:
|
||||
"""
|
||||
Args:
|
||||
dataset_id: The ID of the dataset to filter by. Defaults to None.
|
||||
dataset_name: The name of the dataset to filter by. Defaults to None.
|
||||
content_key: The inputs key to set as Document page content. ``"."`` characters
|
||||
are interpreted as nested keys. E.g. ``content_key="first.second"`` will
|
||||
result in
|
||||
``Document(page_content=format_content(example.inputs["first"]["second"]))``
|
||||
format_content: Function for converting the content extracted from the example
|
||||
inputs into a string. Defaults to JSON-encoding the contents.
|
||||
example_ids: The IDs of the examples to filter by. Defaults to None.
|
||||
as_of: The dataset version tag OR
|
||||
timestamp to retrieve the examples as of.
|
||||
Response examples will only be those that were present at the time
|
||||
of the tagged (or timestamped) version.
|
||||
splits: A list of dataset splits, which are
|
||||
divisions of your dataset such as 'train', 'test', or 'validation'.
|
||||
Returns examples only from the specified splits.
|
||||
inline_s3_urls: Whether to inline S3 URLs. Defaults to True.
|
||||
offset: The offset to start from. Defaults to 0.
|
||||
limit: The maximum number of examples to return.
|
||||
filter: A structured filter string to apply to the examples.
|
||||
client: LangSmith Client. If not provided will be initialized from below args.
|
||||
client_kwargs: Keyword args to pass to LangSmith client init. Should only be
|
||||
specified if ``client`` isn't.
|
||||
""" # noqa: E501
|
||||
if client and client_kwargs:
|
||||
raise ValueError
|
||||
self._client = client or LangSmithClient(**client_kwargs)
|
||||
self.content_key = list(content_key.split(".")) if content_key else []
|
||||
self.format_content = format_content or _stringify
|
||||
self.dataset_id = dataset_id
|
||||
self.dataset_name = dataset_name
|
||||
self.example_ids = example_ids
|
||||
self.as_of = as_of
|
||||
self.splits = splits
|
||||
self.inline_s3_urls = inline_s3_urls
|
||||
self.offset = offset
|
||||
self.limit = limit
|
||||
self.metadata = metadata
|
||||
self.filter = filter
|
||||
|
||||
def lazy_load(self) -> Iterator[Document]:
|
||||
for example in self._client.list_examples(
|
||||
dataset_id=self.dataset_id,
|
||||
dataset_name=self.dataset_name,
|
||||
example_ids=self.example_ids,
|
||||
as_of=self.as_of,
|
||||
splits=self.splits,
|
||||
inline_s3_urls=self.inline_s3_urls,
|
||||
offset=self.offset,
|
||||
limit=self.limit,
|
||||
metadata=self.metadata,
|
||||
filter=self.filter,
|
||||
):
|
||||
content: Any = example.inputs
|
||||
for key in self.content_key:
|
||||
content = content[key]
|
||||
content_str = self.format_content(content)
|
||||
metadata = example.dict()
|
||||
# Stringify datetime and UUID types.
|
||||
for k in ("dataset_id", "created_at", "modified_at", "source_run_id", "id"):
|
||||
metadata[k] = str(metadata[k]) if metadata[k] else metadata[k]
|
||||
yield Document(content_str, metadata=metadata)
|
||||
|
||||
|
||||
def _stringify(x: Union[str, dict]) -> str:
|
||||
if isinstance(x, str):
|
||||
return x
|
||||
else:
|
||||
try:
|
||||
return json.dumps(x, indent=2)
|
||||
except Exception:
|
||||
return str(x)
|
||||
@@ -32,6 +32,7 @@ def _has_next(iterator: Iterator) -> bool:
|
||||
return next(iterator, sentinel) is not sentinel
|
||||
|
||||
|
||||
@beta()
|
||||
class Node(Serializable):
|
||||
"""Node in the GraphVectorStore.
|
||||
|
||||
@@ -115,6 +116,7 @@ def _documents_to_nodes(documents: Iterable[Document]) -> Iterator[Node]:
|
||||
)
|
||||
|
||||
|
||||
@beta()
|
||||
def nodes_to_documents(nodes: Iterable[Node]) -> Iterator[Document]:
|
||||
for node in nodes:
|
||||
metadata = node.metadata.copy()
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import Iterable, List, Literal, Union
|
||||
|
||||
from langchain_core._api import beta
|
||||
from langchain_core.documents import Document
|
||||
|
||||
|
||||
@beta()
|
||||
@dataclass(frozen=True)
|
||||
class Link:
|
||||
"""A link to/from a tag of a given kind.
|
||||
@@ -38,6 +40,7 @@ class Link:
|
||||
METADATA_LINKS_KEY = "links"
|
||||
|
||||
|
||||
@beta()
|
||||
def get_links(doc: Document) -> List[Link]:
|
||||
"""Get the links from a document.
|
||||
Args:
|
||||
@@ -54,6 +57,7 @@ def get_links(doc: Document) -> List[Link]:
|
||||
return links
|
||||
|
||||
|
||||
@beta()
|
||||
def add_links(doc: Document, *links: Union[Link, Iterable[Link]]) -> None:
|
||||
"""Add links to the given metadata.
|
||||
Args:
|
||||
@@ -68,6 +72,7 @@ def add_links(doc: Document, *links: Union[Link, Iterable[Link]]) -> None:
|
||||
links_in_metadata.append(link)
|
||||
|
||||
|
||||
@beta()
|
||||
def copy_with_links(doc: Document, *links: Union[Link, Iterable[Link]]) -> Document:
|
||||
"""Return a document with the given links added.
|
||||
|
||||
|
||||
@@ -39,6 +39,7 @@ https://python.langchain.com/v0.2/docs/how_to/custom_llm/
|
||||
|
||||
from langchain_core.language_models.base import (
|
||||
BaseLanguageModel,
|
||||
LangSmithParams,
|
||||
LanguageModelInput,
|
||||
LanguageModelLike,
|
||||
LanguageModelOutput,
|
||||
@@ -62,6 +63,7 @@ __all__ = [
|
||||
"LLM",
|
||||
"LanguageModelInput",
|
||||
"get_tokenizer",
|
||||
"LangSmithParams",
|
||||
"LanguageModelOutput",
|
||||
"LanguageModelLike",
|
||||
"FakeListLLM",
|
||||
|
||||
@@ -8,6 +8,7 @@ from typing import (
|
||||
Callable,
|
||||
Dict,
|
||||
List,
|
||||
Literal,
|
||||
Mapping,
|
||||
Optional,
|
||||
Sequence,
|
||||
@@ -17,7 +18,7 @@ from typing import (
|
||||
Union,
|
||||
)
|
||||
|
||||
from typing_extensions import TypeAlias
|
||||
from typing_extensions import TypeAlias, TypedDict
|
||||
|
||||
from langchain_core._api import deprecated
|
||||
from langchain_core.messages import (
|
||||
@@ -37,6 +38,23 @@ if TYPE_CHECKING:
|
||||
from langchain_core.outputs import LLMResult
|
||||
|
||||
|
||||
class LangSmithParams(TypedDict, total=False):
|
||||
"""LangSmith parameters for tracing."""
|
||||
|
||||
ls_provider: str
|
||||
"""Provider of the model."""
|
||||
ls_model_name: str
|
||||
"""Name of the model."""
|
||||
ls_model_type: Literal["chat", "llm"]
|
||||
"""Type of the model. Should be 'chat' or 'llm'."""
|
||||
ls_temperature: Optional[float]
|
||||
"""Temperature for generation."""
|
||||
ls_max_tokens: Optional[int]
|
||||
"""Max tokens for generation."""
|
||||
ls_stop: Optional[List[str]]
|
||||
"""Stop words for generation."""
|
||||
|
||||
|
||||
@lru_cache(maxsize=None) # Cache the tokenizer
|
||||
def get_tokenizer() -> Any:
|
||||
"""Get a GPT-2 tokenizer instance.
|
||||
|
||||
@@ -23,8 +23,6 @@ from typing import (
|
||||
cast,
|
||||
)
|
||||
|
||||
from typing_extensions import TypedDict
|
||||
|
||||
from langchain_core._api import deprecated
|
||||
from langchain_core.caches import BaseCache
|
||||
from langchain_core.callbacks import (
|
||||
@@ -36,7 +34,11 @@ from langchain_core.callbacks import (
|
||||
Callbacks,
|
||||
)
|
||||
from langchain_core.globals import get_llm_cache
|
||||
from langchain_core.language_models.base import BaseLanguageModel, LanguageModelInput
|
||||
from langchain_core.language_models.base import (
|
||||
BaseLanguageModel,
|
||||
LangSmithParams,
|
||||
LanguageModelInput,
|
||||
)
|
||||
from langchain_core.load import dumpd, dumps
|
||||
from langchain_core.messages import (
|
||||
AIMessage,
|
||||
@@ -73,23 +75,6 @@ if TYPE_CHECKING:
|
||||
from langchain_core.tools import BaseTool
|
||||
|
||||
|
||||
class LangSmithParams(TypedDict, total=False):
|
||||
"""LangSmith parameters for tracing."""
|
||||
|
||||
ls_provider: str
|
||||
"""Provider of the model."""
|
||||
ls_model_name: str
|
||||
"""Name of the model."""
|
||||
ls_model_type: Literal["chat"]
|
||||
"""Type of the model. Should be 'chat'."""
|
||||
ls_temperature: Optional[float]
|
||||
"""Temperature for generation."""
|
||||
ls_max_tokens: Optional[int]
|
||||
"""Max tokens for generation."""
|
||||
ls_stop: Optional[List[str]]
|
||||
"""Stop words for generation."""
|
||||
|
||||
|
||||
def generate_from_stream(stream: Iterator[ChatGenerationChunk]) -> ChatResult:
|
||||
"""Generate from a stream.
|
||||
|
||||
|
||||
@@ -48,7 +48,11 @@ from langchain_core.callbacks import (
|
||||
Callbacks,
|
||||
)
|
||||
from langchain_core.globals import get_llm_cache
|
||||
from langchain_core.language_models.base import BaseLanguageModel, LanguageModelInput
|
||||
from langchain_core.language_models.base import (
|
||||
BaseLanguageModel,
|
||||
LangSmithParams,
|
||||
LanguageModelInput,
|
||||
)
|
||||
from langchain_core.load import dumpd
|
||||
from langchain_core.messages import (
|
||||
AIMessage,
|
||||
@@ -331,6 +335,43 @@ class BaseLLM(BaseLanguageModel[str], ABC):
|
||||
"Must be a PromptValue, str, or list of BaseMessages."
|
||||
)
|
||||
|
||||
def _get_ls_params(
|
||||
self,
|
||||
stop: Optional[List[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> LangSmithParams:
|
||||
"""Get standard params for tracing."""
|
||||
|
||||
# get default provider from class name
|
||||
default_provider = self.__class__.__name__
|
||||
if default_provider.endswith("LLM"):
|
||||
default_provider = default_provider[:-3]
|
||||
default_provider = default_provider.lower()
|
||||
|
||||
ls_params = LangSmithParams(ls_provider=default_provider, ls_model_type="llm")
|
||||
if stop:
|
||||
ls_params["ls_stop"] = stop
|
||||
|
||||
# model
|
||||
if hasattr(self, "model") and isinstance(self.model, str):
|
||||
ls_params["ls_model_name"] = self.model
|
||||
elif hasattr(self, "model_name") and isinstance(self.model_name, str):
|
||||
ls_params["ls_model_name"] = self.model_name
|
||||
|
||||
# temperature
|
||||
if "temperature" in kwargs and isinstance(kwargs["temperature"], float):
|
||||
ls_params["ls_temperature"] = kwargs["temperature"]
|
||||
elif hasattr(self, "temperature") and isinstance(self.temperature, float):
|
||||
ls_params["ls_temperature"] = self.temperature
|
||||
|
||||
# max_tokens
|
||||
if "max_tokens" in kwargs and isinstance(kwargs["max_tokens"], int):
|
||||
ls_params["ls_max_tokens"] = kwargs["max_tokens"]
|
||||
elif hasattr(self, "max_tokens") and isinstance(self.max_tokens, int):
|
||||
ls_params["ls_max_tokens"] = self.max_tokens
|
||||
|
||||
return ls_params
|
||||
|
||||
def invoke(
|
||||
self,
|
||||
input: LanguageModelInput,
|
||||
@@ -487,13 +528,17 @@ class BaseLLM(BaseLanguageModel[str], ABC):
|
||||
params["stop"] = stop
|
||||
params = {**params, **kwargs}
|
||||
options = {"stop": stop}
|
||||
inheritable_metadata = {
|
||||
**(config.get("metadata") or {}),
|
||||
**self._get_ls_params(stop=stop, **kwargs),
|
||||
}
|
||||
callback_manager = CallbackManager.configure(
|
||||
config.get("callbacks"),
|
||||
self.callbacks,
|
||||
self.verbose,
|
||||
config.get("tags"),
|
||||
self.tags,
|
||||
config.get("metadata"),
|
||||
inheritable_metadata,
|
||||
self.metadata,
|
||||
)
|
||||
(run_manager,) = callback_manager.on_llm_start(
|
||||
@@ -548,13 +593,17 @@ class BaseLLM(BaseLanguageModel[str], ABC):
|
||||
params["stop"] = stop
|
||||
params = {**params, **kwargs}
|
||||
options = {"stop": stop}
|
||||
inheritable_metadata = {
|
||||
**(config.get("metadata") or {}),
|
||||
**self._get_ls_params(stop=stop, **kwargs),
|
||||
}
|
||||
callback_manager = AsyncCallbackManager.configure(
|
||||
config.get("callbacks"),
|
||||
self.callbacks,
|
||||
self.verbose,
|
||||
config.get("tags"),
|
||||
self.tags,
|
||||
config.get("metadata"),
|
||||
inheritable_metadata,
|
||||
self.metadata,
|
||||
)
|
||||
(run_manager,) = await callback_manager.on_llm_start(
|
||||
@@ -796,6 +845,21 @@ class BaseLLM(BaseLanguageModel[str], ABC):
|
||||
f" argument of type {type(prompts)}."
|
||||
)
|
||||
# Create callback managers
|
||||
if isinstance(metadata, list):
|
||||
metadata = [
|
||||
{
|
||||
**(meta or {}),
|
||||
**self._get_ls_params(stop=stop, **kwargs),
|
||||
}
|
||||
for meta in metadata
|
||||
]
|
||||
elif isinstance(metadata, dict):
|
||||
metadata = {
|
||||
**(metadata or {}),
|
||||
**self._get_ls_params(stop=stop, **kwargs),
|
||||
}
|
||||
else:
|
||||
pass
|
||||
if (
|
||||
isinstance(callbacks, list)
|
||||
and callbacks
|
||||
@@ -1017,6 +1081,21 @@ class BaseLLM(BaseLanguageModel[str], ABC):
|
||||
An LLMResult, which contains a list of candidate Generations for each input
|
||||
prompt and additional model provider-specific output.
|
||||
"""
|
||||
if isinstance(metadata, list):
|
||||
metadata = [
|
||||
{
|
||||
**(meta or {}),
|
||||
**self._get_ls_params(stop=stop, **kwargs),
|
||||
}
|
||||
for meta in metadata
|
||||
]
|
||||
elif isinstance(metadata, dict):
|
||||
metadata = {
|
||||
**(metadata or {}),
|
||||
**self._get_ls_params(stop=stop, **kwargs),
|
||||
}
|
||||
else:
|
||||
pass
|
||||
# Create callback managers
|
||||
if isinstance(callbacks, list) and (
|
||||
isinstance(callbacks[0], (list, BaseCallbackManager))
|
||||
|
||||
@@ -514,6 +514,8 @@ def merge_message_runs(
|
||||
return merged
|
||||
|
||||
|
||||
# TODO: Update so validation errors (for token_counter, for example) are raised on
|
||||
# init not at runtime.
|
||||
@_runnable_support
|
||||
def trim_messages(
|
||||
messages: Union[Iterable[MessageLikeRepresentation], PromptValue],
|
||||
@@ -759,24 +761,30 @@ def trim_messages(
|
||||
AIMessage("This is a 4 token text. The full message is 10 tokens.", id="fourth"),
|
||||
]
|
||||
""" # noqa: E501
|
||||
from langchain_core.language_models import BaseLanguageModel
|
||||
|
||||
if start_on and strategy == "first":
|
||||
raise ValueError
|
||||
if include_system and strategy == "first":
|
||||
raise ValueError
|
||||
messages = convert_to_messages(messages)
|
||||
if isinstance(token_counter, BaseLanguageModel):
|
||||
list_token_counter = token_counter.get_num_tokens_from_messages
|
||||
elif (
|
||||
list(inspect.signature(token_counter).parameters.values())[0].annotation
|
||||
is BaseMessage
|
||||
):
|
||||
if hasattr(token_counter, "get_num_tokens_from_messages"):
|
||||
list_token_counter = getattr(token_counter, "get_num_tokens_from_messages")
|
||||
elif callable(token_counter):
|
||||
if (
|
||||
list(inspect.signature(token_counter).parameters.values())[0].annotation
|
||||
is BaseMessage
|
||||
):
|
||||
|
||||
def list_token_counter(messages: Sequence[BaseMessage]) -> int:
|
||||
return sum(token_counter(msg) for msg in messages) # type: ignore[arg-type, misc]
|
||||
def list_token_counter(messages: Sequence[BaseMessage]) -> int:
|
||||
return sum(token_counter(msg) for msg in messages) # type: ignore[arg-type, misc]
|
||||
else:
|
||||
list_token_counter = token_counter # type: ignore[assignment]
|
||||
else:
|
||||
list_token_counter = token_counter # type: ignore[assignment]
|
||||
raise ValueError(
|
||||
f"'token_counter' expected to be a model that implements "
|
||||
f"'get_num_tokens_from_messages()' or a function. Received object of type "
|
||||
f"{type(token_counter)}."
|
||||
)
|
||||
|
||||
try:
|
||||
from langchain_text_splitters import TextSplitter
|
||||
|
||||
@@ -348,37 +348,7 @@ def merge_configs(*configs: Optional[RunnableConfig]) -> RunnableConfig:
|
||||
base["callbacks"] = mngr
|
||||
else:
|
||||
# base_callbacks is also a manager
|
||||
manager = base_callbacks.__class__(
|
||||
parent_run_id=base_callbacks.parent_run_id
|
||||
or these_callbacks.parent_run_id,
|
||||
handlers=[],
|
||||
inheritable_handlers=[],
|
||||
tags=list(set(base_callbacks.tags + these_callbacks.tags)),
|
||||
inheritable_tags=list(
|
||||
set(
|
||||
base_callbacks.inheritable_tags
|
||||
+ these_callbacks.inheritable_tags
|
||||
)
|
||||
),
|
||||
metadata={
|
||||
**base_callbacks.metadata,
|
||||
**these_callbacks.metadata,
|
||||
},
|
||||
)
|
||||
|
||||
handlers = base_callbacks.handlers + these_callbacks.handlers
|
||||
inheritable_handlers = (
|
||||
base_callbacks.inheritable_handlers
|
||||
+ these_callbacks.inheritable_handlers
|
||||
)
|
||||
|
||||
for handler in handlers:
|
||||
manager.add_handler(handler)
|
||||
|
||||
for handler in inheritable_handlers:
|
||||
manager.add_handler(handler, inherit=True)
|
||||
|
||||
base["callbacks"] = manager
|
||||
base["callbacks"] = base_callbacks.merge(these_callbacks)
|
||||
elif key == "recursion_limit":
|
||||
if config["recursion_limit"] != DEFAULT_RECURSION_LIMIT:
|
||||
base["recursion_limit"] = config["recursion_limit"]
|
||||
|
||||
@@ -45,12 +45,14 @@ from langchain_core.tools.convert import (
|
||||
convert_runnable_to_tool as convert_runnable_to_tool,
|
||||
)
|
||||
from langchain_core.tools.convert import tool as tool
|
||||
from langchain_core.tools.render import ToolsRenderer as ToolsRenderer
|
||||
from langchain_core.tools.render import (
|
||||
render_text_description as render_text_description,
|
||||
)
|
||||
from langchain_core.tools.render import (
|
||||
render_text_description_and_args as render_text_description_and_args,
|
||||
)
|
||||
from langchain_core.tools.retriever import RetrieverInput as RetrieverInput
|
||||
from langchain_core.tools.retriever import (
|
||||
create_retriever_tool as create_retriever_tool,
|
||||
)
|
||||
|
||||
@@ -7,7 +7,7 @@ import importlib
|
||||
import os
|
||||
import warnings
|
||||
from importlib.metadata import version
|
||||
from typing import Any, Callable, Dict, Optional, Set, Tuple, Union, overload
|
||||
from typing import Any, Callable, Dict, Optional, Sequence, Set, Tuple, Union, overload
|
||||
|
||||
from packaging.version import parse
|
||||
from requests import HTTPError, Response
|
||||
@@ -280,13 +280,17 @@ def from_env(key: str, /) -> Callable[[], str]: ...
|
||||
def from_env(key: str, /, *, default: str) -> Callable[[], str]: ...
|
||||
|
||||
|
||||
@overload
|
||||
def from_env(key: Sequence[str], /, *, default: str) -> Callable[[], str]: ...
|
||||
|
||||
|
||||
@overload
|
||||
def from_env(key: str, /, *, error_message: str) -> Callable[[], str]: ...
|
||||
|
||||
|
||||
@overload
|
||||
def from_env(
|
||||
key: str, /, *, default: str, error_message: Optional[str]
|
||||
key: Union[str, Sequence[str]], /, *, default: str, error_message: Optional[str]
|
||||
) -> Callable[[], str]: ...
|
||||
|
||||
|
||||
@@ -296,8 +300,12 @@ def from_env(
|
||||
) -> Callable[[], Optional[str]]: ...
|
||||
|
||||
|
||||
@overload
|
||||
def from_env(key: str, /, *, default: None) -> Callable[[], Optional[str]]: ...
|
||||
|
||||
|
||||
def from_env(
|
||||
key: str,
|
||||
key: Union[str, Sequence[str]],
|
||||
/,
|
||||
*,
|
||||
default: Union[str, _NoDefaultType, None] = _NoDefault,
|
||||
@@ -306,7 +314,10 @@ def from_env(
|
||||
"""Create a factory method that gets a value from an environment variable.
|
||||
|
||||
Args:
|
||||
key: The environment variable to look up.
|
||||
key: The environment variable to look up. If a list of keys is provided,
|
||||
the first key found in the environment will be used.
|
||||
If no key is found, the default value will be used if set,
|
||||
otherwise an error will be raised.
|
||||
default: The default value to return if the environment variable is not set.
|
||||
error_message: the error message which will be raised if the key is not found
|
||||
and no default value is provided.
|
||||
@@ -315,9 +326,15 @@ def from_env(
|
||||
|
||||
def get_from_env_fn() -> Optional[str]:
|
||||
"""Get a value from an environment variable."""
|
||||
if key in os.environ:
|
||||
return os.environ[key]
|
||||
elif isinstance(default, (str, type(None))):
|
||||
if isinstance(key, (list, tuple)):
|
||||
for k in key:
|
||||
if k in os.environ:
|
||||
return os.environ[k]
|
||||
if isinstance(key, str):
|
||||
if key in os.environ:
|
||||
return os.environ[key]
|
||||
|
||||
if isinstance(default, (str, type(None))):
|
||||
return default
|
||||
else:
|
||||
if error_message:
|
||||
|
||||
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
||||
|
||||
[tool.poetry]
|
||||
name = "langchain-core"
|
||||
version = "0.2.31"
|
||||
version = "0.2.33"
|
||||
description = "Building applications with LLMs through composability"
|
||||
authors = []
|
||||
license = "MIT"
|
||||
|
||||
@@ -0,0 +1,58 @@
|
||||
import datetime
|
||||
import uuid
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
from langsmith.schemas import Example
|
||||
|
||||
from langchain_core.document_loaders import LangSmithLoader
|
||||
from langchain_core.documents import Document
|
||||
|
||||
|
||||
def test_init() -> None:
|
||||
LangSmithLoader(api_key="secret")
|
||||
|
||||
|
||||
EXAMPLES = [
|
||||
Example(
|
||||
inputs={"first": {"second": "foo"}},
|
||||
outputs={"res": "a"},
|
||||
dataset_id=uuid.uuid4(),
|
||||
id=uuid.uuid4(),
|
||||
created_at=datetime.datetime.now(),
|
||||
),
|
||||
Example(
|
||||
inputs={"first": {"second": "bar"}},
|
||||
outputs={"res": "b"},
|
||||
dataset_id=uuid.uuid4(),
|
||||
id=uuid.uuid4(),
|
||||
created_at=datetime.datetime.now(),
|
||||
),
|
||||
Example(
|
||||
inputs={"first": {"second": "baz"}},
|
||||
outputs={"res": "c"},
|
||||
dataset_id=uuid.uuid4(),
|
||||
id=uuid.uuid4(),
|
||||
created_at=datetime.datetime.now(),
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@patch("langsmith.Client.list_examples", MagicMock(return_value=iter(EXAMPLES)))
|
||||
def test_lazy_load() -> None:
|
||||
loader = LangSmithLoader(
|
||||
api_key="dummy",
|
||||
dataset_id="mock",
|
||||
content_key="first.second",
|
||||
format_content=(lambda x: x.upper()),
|
||||
)
|
||||
expected = []
|
||||
for example in EXAMPLES:
|
||||
metadata = {
|
||||
k: v if not v or isinstance(v, dict) else str(v)
|
||||
for k, v in example.dict().items()
|
||||
}
|
||||
expected.append(
|
||||
Document(example.inputs["first"]["second"].upper(), metadata=metadata)
|
||||
)
|
||||
actual = [doc for doc in loader.lazy_load()]
|
||||
assert expected == actual
|
||||
@@ -6,6 +6,7 @@ EXPECTED_ALL = [
|
||||
"SimpleChatModel",
|
||||
"BaseLLM",
|
||||
"LLM",
|
||||
"LangSmithParams",
|
||||
"LanguageModelInput",
|
||||
"LanguageModelOutput",
|
||||
"LanguageModelLike",
|
||||
|
||||
@@ -2,6 +2,7 @@ from typing import Dict, List, Type
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain_core.language_models.fake_chat_models import FakeChatModel
|
||||
from langchain_core.messages import (
|
||||
AIMessage,
|
||||
BaseMessage,
|
||||
@@ -316,6 +317,19 @@ def test_trim_messages_invoke() -> None:
|
||||
assert actual == expected
|
||||
|
||||
|
||||
def test_trim_messages_bound_model_token_counter() -> None:
|
||||
trimmer = trim_messages(
|
||||
max_tokens=10, token_counter=FakeTokenCountingModel().bind(foo="bar")
|
||||
)
|
||||
trimmer.invoke([HumanMessage("foobar")])
|
||||
|
||||
|
||||
def test_trim_messages_bad_token_counter() -> None:
|
||||
trimmer = trim_messages(max_tokens=10, token_counter={})
|
||||
with pytest.raises(ValueError):
|
||||
trimmer.invoke([HumanMessage("foobar")])
|
||||
|
||||
|
||||
def dummy_token_counter(messages: List[BaseMessage]) -> int:
|
||||
# treat each message like it adds 3 default tokens at the beginning
|
||||
# of the message and at the end of the message. 3 + 4 + 3 = 10 tokens
|
||||
@@ -338,3 +352,8 @@ def dummy_token_counter(messages: List[BaseMessage]) -> int:
|
||||
+ default_msg_suffix_len
|
||||
)
|
||||
return count
|
||||
|
||||
|
||||
class FakeTokenCountingModel(FakeChatModel):
|
||||
def get_num_tokens_from_messages(self, messages: List[BaseMessage]) -> int:
|
||||
return dummy_token_counter(messages)
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user