Compare commits
1 commit
harrison/p ... eugene/enu

| Author | SHA1 | Date |
|---|---|---|
|  | 71450f2a7e |  |

22  .github/PULL_REQUEST_TEMPLATE.md (vendored)
@@ -1,20 +1,28 @@
<!-- Thank you for contributing to LangChain!

Replace this entire comment with:
Replace this comment with:
- Description: a description of the change,
- Issue: the issue # it fixes (if applicable),
- Dependencies: any dependencies required for this change,
- Tag maintainer: for a quicker response, tag the relevant maintainer (see below),
- Twitter handle: we announce bigger features on Twitter. If your PR gets announced and you'd like a mention, we'll gladly shout you out!

Please make sure your PR is passing linting and testing before submitting. Run `make format`, `make lint` and `make test` to check this locally.

See contribution guidelines for more information on how to write/run tests, lint, etc:
https://github.com/hwchase17/langchain/blob/master/.github/CONTRIBUTING.md
Please make sure you're PR is passing linting and testing before submitting. Run `make format`, `make lint` and `make test` to check this locally.

If you're adding a new integration, please include:
1. a test for the integration, preferably unit tests that do not rely on network access,
2. an example notebook showing its use. These live is docs/extras directory.
2. an example notebook showing its use.

If no one reviews your PR within a few days, please @-mention one of @baskaryan, @eyurtsev, @hwchase17, @rlancemartin.
Maintainer responsibilities:
- General / Misc / if you don't know who to tag: @baskaryan
- DataLoaders / VectorStores / Retrievers: @rlancemartin, @eyurtsev
- Models / Prompts: @hwchase17, @baskaryan
- Memory: @hwchase17
- Agents / Tools / Toolkits: @hinthornw
- Tracing / Callbacks: @agola11
- Async: @agola11

If no one reviews your PR within a few days, feel free to @-mention the same people again.

See contribution guidelines for more information on how to write/run tests, lint, etc: https://github.com/hwchase17/langchain/blob/master/.github/CONTRIBUTING.md
-->
41  .github/workflows/_test.yml (vendored)
@@ -10,7 +10,7 @@ on:
      test_type:
        type: string
        description: "Test types to run"
        default: '["core", "extended", "core-pydantic-2"]'
        default: '["core", "extended"]'

env:
  POETRY_VERSION: "1.4.2"

@@ -43,42 +43,19 @@ jobs:
          if [ "${{ matrix.test_type }}" == "core" ]; then
            echo "Running core tests, installing dependencies with poetry..."
            poetry install
          elif [ "${{ matrix.test_type }}" == "core-pydantic-2" ]; then
            echo "Running core-pydantic-v2 tests, installing dependencies with poetry..."
            poetry install
            poetry add pydantic@2.1
          else
            echo "Running extended tests, installing dependencies with poetry..."
            poetry install -E extended_testing
          fi
      - name: Verify pydantic version
      - name: Install langchain editable
        if: ${{ inputs.working-directory != 'langchain' }}
        run: |
          if [ "${{ matrix.test_type }}" == "core-pydantic-2" ]; then
            EXPECTED_VERSION=2
          else
            EXPECTED_VERSION=1
          fi
          echo "Checking pydantic version... Expecting ${EXPECTED_VERSION}"

          # Determine the major part of pydantic version
          VERSION=$(poetry run python -c "import pydantic; print(pydantic.__version__)" | cut -d. -f1)

          # Check that the major part of pydantic version is as expected, if not
          # raise an error
          if [[ "$VERSION" -ne $EXPECTED_VERSION ]]; then
            echo "Error: pydantic version must be equal to ${EXPECTED_VERSION}; Found: ${VERSION}"
            exit 1
          fi
          echo "Found pydantic version ${VERSION}, as expected"
        shell: bash
          pip install -e ../langchain
      - name: Run ${{matrix.test_type}} tests
        run: |
          case "${{ matrix.test_type }}" in
            core | core-pydantic-2)
              make test
              ;;
            *)
              make extended_tests
              ;;
          esac
          if [ "${{ matrix.test_type }}" == "core" ]; then
            make test
          else
            make extended_tests
          fi
        shell: bash
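For local reproduction, the removed "Verify pydantic version" step above reduces to a major-version comparison. A minimal Python sketch of the same check, assuming it runs inside the project's poetry environment:

```python
# Hedged sketch of the CI version check above; mirrors the shell logic.
import pydantic

EXPECTED_MAJOR = 1  # the CI sets 2 for the "core-pydantic-2" matrix entry

major = int(pydantic.__version__.split(".")[0])
if major != EXPECTED_MAJOR:
    raise SystemExit(
        f"Error: pydantic version must be equal to {EXPECTED_MAJOR}; Found: {major}"
    )
print(f"Found pydantic version {pydantic.__version__}, as expected")
```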
1  .github/workflows/langchain_ci.yml (vendored)
@@ -24,5 +24,4 @@ jobs:
    ./.github/workflows/_test.yml
    with:
      working-directory: libs/langchain
      test_type: '["core", "extended"]'
    secrets: inherit
@@ -1,5 +1,5 @@
---
name: libs/experimental CI
name: libs/langchain-experimental CI

on:
  push:

@@ -1,5 +1,5 @@
---
name: libs/experimental Release
name: libs/langchain-experimental Release

on:
  pull_request:
42  .github/workflows/scheduled_test.yml (vendored)
@@ -1,42 +0,0 @@
name: Scheduled tests

on:
  workflow_dispatch: # Allows to trigger the workflow manually in GitHub UI
  schedule:
    - cron: '0 13 * * *'

env:
  POETRY_VERSION: "1.4.2"

jobs:
  build:
    defaults:
      run:
        working-directory: libs/langchain
    runs-on: ubuntu-latest
    environment: Scheduled testing
    strategy:
      matrix:
        python-version:
          - "3.8"
          - "3.9"
          - "3.10"
          - "3.11"
    name: Python ${{ matrix.python-version }}
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: "./.github/actions/poetry_setup"
        with:
          python-version: ${{ matrix.python-version }}
          poetry-version: "1.4.2"
          working-directory: libs/langchain
          install-command: |
            echo "Running scheduled tests, installing dependencies with poetry..."
            poetry install --with=test_integration
      - name: Run tests
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          make scheduled_tests
        shell: bash
7  Makefile
@@ -43,12 +43,7 @@ spell_fix:

help:
	@echo '----'
	@echo 'clean - run docs_clean and api_docs_clean'
	@echo 'coverage - run unit tests and generate coverage report'
	@echo 'docs_build - build the documentation'
	@echo 'docs_clean - clean the documentation build artifacts'
	@echo 'docs_linkcheck - run linkchecker on the documentation'
	@echo 'api_docs_build - build the API Reference documentation'
	@echo 'api_docs_clean - clean the API Reference documentation build artifacts'
	@echo 'api_docs_linkcheck - run linkchecker on the API Reference documentation'
	@echo 'spell_check - run codespell on the project'
	@echo 'spell_fix - run codespell on the project and fix the errors'
@@ -18,10 +18,10 @@

Looking for the JS/TS version? Check out [LangChain.js](https://github.com/hwchase17/langchainjs).

**Production Support:** As you move your LangChains into production, we'd love to offer more hands-on support.
Fill out [this form](https://airtable.com/appwQzlErAS2qiP0L/shrGtGaVBVAz7NcV2) to share more about what you're building, and our team will get in touch.
**Production Support:** As you move your LangChains into production, we'd love to offer more comprehensive support.
Please fill out [this form](https://6w1pwbss0py.typeform.com/to/rrbrdTH2) and we'll set up a dedicated support Slack channel.

## 🚨Breaking Changes for select chains (SQLDatabase) on 7/28/23
## 🚨Breaking Changes for select chains (SQLDatabase) on 7/28

In an effort to make `langchain` leaner and safer, we are moving select chains to `langchain_experimental`.
This migration has already started, but we are remaining backwards compatible until 7/28.
@@ -100,9 +100,6 @@ extensions = [
]
source_suffix = [".rst"]

# some autodoc pydantic options are repeated in the actual template.
# potentially user error, but there may be bugs in the sphinx extension
# with options not being passed through correctly (from either the location in the code)
autodoc_pydantic_model_show_json = False
autodoc_pydantic_field_list_validators = False
autodoc_pydantic_config_members = False
@@ -115,6 +112,13 @@ autodoc_member_order = "groupwise"
autoclass_content = "both"
autodoc_typehints_format = "short"

autodoc_default_options = {
    "members": True,
    "show-inheritance": True,
    "inherited-members": "BaseModel",
    "undoc-members": True,
    "special-members": "__call__",
}
# autodoc_typehints = "description"
# Add any paths that contain templates here, relative to this directory.
templates_path = ["templates"]
@@ -1,216 +1,49 @@
"""Script for auto-generating api_reference.rst."""
import importlib
import inspect
import typing
"""Script for auto-generating api_reference.rst"""
import glob
import re
from pathlib import Path
from typing import TypedDict, Sequence, List, Dict, Literal, Union
from enum import Enum

from pydantic import BaseModel

ROOT_DIR = Path(__file__).parents[2].absolute()
HERE = Path(__file__).parent

PKG_DIR = ROOT_DIR / "libs" / "langchain" / "langchain"
EXP_DIR = ROOT_DIR / "libs" / "experimental" / "langchain_experimental"
WRITE_FILE = HERE / "api_reference.rst"
EXP_WRITE_FILE = HERE / "experimental_api_reference.rst"
WRITE_FILE = Path(__file__).parent / "api_reference.rst"
EXP_WRITE_FILE = Path(__file__).parent / "experimental_api_reference.rst"


ClassKind = Literal["TypedDict", "Regular", "Pydantic", "enum"]
def load_members(dir: Path) -> dict:
    members: dict = {}
    for py in glob.glob(str(dir) + "/**/*.py", recursive=True):
        module = py[len(str(dir)) + 1 :].replace(".py", "").replace("/", ".")
        top_level = module.split(".")[0]
        if top_level not in members:
            members[top_level] = {"classes": [], "functions": []}
        with open(py, "r") as f:
            for line in f.readlines():
                cls = re.findall(r"^class ([^_].*)\(", line)
                members[top_level]["classes"].extend([module + "." + c for c in cls])
                func = re.findall(r"^def ([^_].*)\(", line)
                afunc = re.findall(r"^async def ([^_].*)\(", line)
                func_strings = [module + "." + f for f in func + afunc]
                members[top_level]["functions"].extend(func_strings)
    return members


class ClassInfo(TypedDict):
    """Information about a class."""

    name: str
    """The name of the class."""
    qualified_name: str
    """The fully qualified name of the class."""
    kind: ClassKind
    """The kind of the class."""
    is_public: bool
    """Whether the class is public or not."""


class FunctionInfo(TypedDict):
    """Information about a function."""

    name: str
    """The name of the function."""
    qualified_name: str
    """The fully qualified name of the function."""
    is_public: bool
    """Whether the function is public or not."""


class ModuleMembers(TypedDict):
    """A dictionary of module members."""

    classes_: Sequence[ClassInfo]
    functions: Sequence[FunctionInfo]


def _load_module_members(module_path: str, namespace: str) -> ModuleMembers:
    """Load all members of a module.

    Args:
        module_path: Path to the module.
        namespace: the namespace of the module.

    Returns:
        list: A list of loaded module objects.
    """
    classes_: List[ClassInfo] = []
    functions: List[FunctionInfo] = []
    module = importlib.import_module(module_path)
    for name, type_ in inspect.getmembers(module):
        if not hasattr(type_, "__module__"):
            continue
        if type_.__module__ != module_path:
            continue

        if inspect.isclass(type_):
            if type(type_) == typing._TypedDictMeta:  # type: ignore
                kind: ClassKind = "TypedDict"
            elif issubclass(type_, Enum):
                kind = "enum"
            elif issubclass(type_, BaseModel):
                kind = "Pydantic"
            else:
                kind = "Regular"

            classes_.append(
                ClassInfo(
                    name=name,
                    qualified_name=f"{namespace}.{name}",
                    kind=kind,
                    is_public=not name.startswith("_"),
                )
            )
        elif inspect.isfunction(type_):
            functions.append(
                FunctionInfo(
                    name=name,
                    qualified_name=f"{namespace}.{name}",
                    is_public=not name.startswith("_"),
                )
            )
        else:
            continue

    return ModuleMembers(
        classes_=classes_,
        functions=functions,
    )


def _merge_module_members(
    module_members: Sequence[ModuleMembers],
) -> ModuleMembers:
    """Merge module members."""
    classes_: List[ClassInfo] = []
    functions: List[FunctionInfo] = []
    for module in module_members:
        classes_.extend(module["classes_"])
        functions.extend(module["functions"])

    return ModuleMembers(
        classes_=classes_,
        functions=functions,
    )


def _load_package_modules(
    package_directory: Union[str, Path]
) -> Dict[str, ModuleMembers]:
    """Recursively load modules of a package based on the file system.

    Traversal based on the file system makes it easy to determine which
    of the modules/packages are part of the package vs. 3rd party or built-in.

    Parameters:
        package_directory: Path to the package directory.

    Returns:
        list: A list of loaded module objects.
    """
    package_path = (
        Path(package_directory)
        if isinstance(package_directory, str)
        else package_directory
    )
    modules_by_namespace = {}

    package_name = package_path.name

    for file_path in package_path.rglob("*.py"):
        if file_path.name.startswith("_"):
            continue

        relative_module_name = file_path.relative_to(package_path)

        # Skip if any module part starts with an underscore
        if any(part.startswith("_") for part in relative_module_name.parts):
            continue

        # Get the full namespace of the module
        namespace = str(relative_module_name).replace(".py", "").replace("/", ".")
        # Keep only the top level namespace
        top_namespace = namespace.split(".")[0]

        try:
            module_members = _load_module_members(
                f"{package_name}.{namespace}", namespace
            )
            # Merge module members if the namespace already exists
            if top_namespace in modules_by_namespace:
                existing_module_members = modules_by_namespace[top_namespace]
                _module_members = _merge_module_members(
                    [existing_module_members, module_members]
                )
            else:
                _module_members = module_members

            modules_by_namespace[top_namespace] = _module_members

        except ImportError as e:
            print(f"Error: Unable to import module '{namespace}' with error: {e}")

    return modules_by_namespace


def _construct_doc(pkg: str, members_by_namespace: Dict[str, ModuleMembers]) -> str:
    """Construct the contents of the reference.rst file for the given package.

    Args:
        pkg: The package name
        members_by_namespace: The members of the package, dict organized by top level
                              module contains a list of classes and functions
                              inside of the top level namespace.

    Returns:
        The contents of the reference.rst file.
    """
def construct_doc(pkg: str, members: dict) -> str:
    full_doc = f"""\
=======================
=============
``{pkg}`` API Reference
=======================
=============

"""
    namespaces = sorted(members_by_namespace)

    for module in namespaces:
        _members = members_by_namespace[module]
        classes = _members["classes_"]
    for module, _members in sorted(members.items(), key=lambda kv: kv[0]):
        classes = _members["classes"]
        functions = _members["functions"]
        if not (classes or functions):
            continue
        section = f":mod:`{pkg}.{module}`"
        underline = "=" * (len(section) + 1)
        full_doc += f"""\
{section}
{underline}
{'=' * (len(section) + 1)}

.. automodule:: {pkg}.{module}
    :no-members:
@@ -219,6 +52,7 @@ def _construct_doc(pkg: str, members_by_namespace: Dict[str, ModuleMembers]) ->
"""

        if classes:
            cstring = "\n    ".join(sorted(classes))
            full_doc += f"""\
Classes
--------------
@@ -226,31 +60,13 @@ Classes

.. autosummary::
    :toctree: {module}
    :template: class.rst

{cstring}

"""

            for class_ in classes:
                if not class_["is_public"]:
                    continue

                if class_["kind"] == "TypedDict":
                    template = "typeddict.rst"
                elif class_["kind"] == "enum":
                    template = "enum.rst"
                elif class_["kind"] == "Pydantic":
                    template = "pydantic.rst"
                else:
                    template = "class.rst"

                full_doc += f"""\
    :template: {template}

    {class_["qualified_name"]}

"""

        if functions:
            _functions = [f["qualified_name"] for f in functions if f["is_public"]]
            fstring = "\n    ".join(sorted(_functions))
            fstring = "\n    ".join(sorted(functions))
            full_doc += f"""\
Functions
--------------
@@ -267,15 +83,12 @@ Functions


def main() -> None:
    """Generate the reference.rst file for each package."""
    lc_members = _load_package_modules(PKG_DIR)
    lc_doc = ".. _api_reference:\n\n" + _construct_doc("langchain", lc_members)
    lc_members = load_members(PKG_DIR)
    lc_doc = ".. _api_reference:\n\n" + construct_doc("langchain", lc_members)
    with open(WRITE_FILE, "w") as f:
        f.write(lc_doc)
    exp_members = _load_package_modules(EXP_DIR)
    exp_doc = ".. _experimental_api_reference:\n\n" + _construct_doc(
        "langchain_experimental", exp_members
    )
    exp_members = load_members(EXP_DIR)
    exp_doc = ".. _experimental_api_reference:\n\n" + construct_doc("langchain_experimental", exp_members)
    with open(EXP_WRITE_FILE, "w") as f:
        f.write(exp_doc)
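The short variant of the script in this hunk swaps import-based introspection for a line-by-line regex scan. A small hedged illustration of what those three patterns match; the sample lines are hypothetical:

```python
# Hedged illustration of the regex scan used by load_members above.
import re

samples = [
    "class AgentExecutor(Chain):",  # matched: public top-level class
    "class _Private(Base):",        # skipped: leading underscore
    "def load_tools(",              # matched: public top-level function
    "async def arun(",              # matched by the async-def pattern
]
for line in samples:
    print(
        re.findall(r"^class ([^_].*)\(", line),
        re.findall(r"^def ([^_].*)\(", line),
        re.findall(r"^async def ([^_].*)\(", line),
    )
```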
@@ -1,5 +1,4 @@
-e libs/langchain
-e libs/experimental
autodoc_pydantic==1.8.0
myst_parser
nbsphinx==0.8.9
@@ -11,4 +10,4 @@ sphinx-panels
toml
myst_nb
sphinx_copybutton
pydata-sphinx-theme==0.13.1
pydata-sphinx-theme==0.13.1
@@ -3,34 +3,44 @@

.. currentmodule:: {{ module }}

{% if '_value2member_map_' in all_attributes %}
{% set classType = "enum" %}
{% else %}
{% set classType = "default" %}
{% endif %}

.. autoclass:: {{ objname }}

{% block attributes %}
{% if attributes %}
.. rubric:: {{ _('Attributes') }}
{% if classType == "enum" %}
{% if attributes %}
.. rubric:: {{ _('Attributes') }}
{% endif %}
{% else %}
{% if attributes %}
.. rubric:: {{ _('Attributes') }}
{% endif %}

.. autosummary::
{% for item in attributes %}
   ~{{ name }}.{{ item }}
{%- endfor %}
{% endif %}
{% endblock %}
{% block methods %}
{% if methods %}
.. rubric:: {{ _('Methods') }}

{% block methods %}
{% if methods %}
.. rubric:: {{ _('Methods') }}
.. autosummary::
{% for item in methods %}
   ~{{ name }}.{{ item }}
{%- endfor %}
{% endif %}
{% endblock %}

.. autosummary::
{% for item in methods %}
   ~{{ name }}.{{ item }}
{%- endfor %}
{% block attributes %}
{% if attributes %}
.. rubric:: {{ _('Attributes') }}

{% for item in methods %}
.. automethod:: {{ name }}.{{ item }}
{%- endfor %}
.. autosummary::
{% for item in attributes %}
   ~{{ name }}.{{ item }}
{%- endfor %}
{% endif %}
{% endblock %}
{% endif %}

{% endif %}
{% endblock %}


.. example_links:: {{ objname }}
.. example_links:: {{ objname }}
@@ -1,14 +0,0 @@
:mod:`{{module}}`.{{objname}}
{{ underline }}==============

.. currentmodule:: {{ module }}

.. autoclass:: {{ objname }}

{% block attributes %}
{% for item in attributes %}
.. autoattribute:: {{ item }}
{% endfor %}
{% endblock %}

.. example_links:: {{ objname }}
@@ -1,22 +0,0 @@
:mod:`{{module}}`.{{objname}}
{{ underline }}==============

.. currentmodule:: {{ module }}

.. autopydantic_model:: {{ objname }}
    :model-show-json: False
    :model-show-config-summary: False
    :model-show-validator-members: False
    :model-show-field-summary: False
    :field-signature-prefix: param
    :members:
    :undoc-members:
    :inherited-members:
    :member-order: groupwise
    :show-inheritance: True
    :special-members: __call__

{% block attributes %}
{% endblock %}

.. example_links:: {{ objname }}
@@ -1,14 +0,0 @@
:mod:`{{module}}`.{{objname}}
{{ underline }}==============

.. currentmodule:: {{ module }}

.. autoclass:: {{ objname }}

{% block attributes %}
{% for item in attributes %}
.. autoattribute:: {{ item }}
{% endfor %}
{% endblock %}

.. example_links:: {{ objname }}
@@ -19,7 +19,7 @@
{% block htmltitle %}
<title>{{ title|striptags|e }}{{ titlesuffix }}</title>
{% endblock %}
<link rel="canonical" href="https://api.python.langchain.com/en/latest/{{pagename}}.html" />
<link rel="canonical" href="http://scikit-learn.org/stable/{{pagename}}.html" />

{% if favicon_url %}
<link rel="shortcut icon" href="{{ favicon_url|e }}"/>

@@ -6,6 +6,17 @@
{%- set top_container_cls = "sk-landing-container" %}
{%- endif %}

{% if theme_link_to_live_contributing_page|tobool %}
  {# Link to development page for live builds #}
  {%- set development_link = "https://scikit-learn.org/dev/developers/index.html" %}
  {# Open on a new development page in new window/tab for live builds #}
  {%- set development_attrs = 'target="_blank" rel="noopener noreferrer"' %}
{%- else %}
  {%- set development_link = pathto('developers/index') %}
  {%- set development_attrs = '' %}
{%- endif %}


<nav id="navbar" class="{{ nav_bar_class }} navbar navbar-expand-md navbar-light bg-light py-0">
  <div class="container-fluid {{ top_container_cls }} px-0">
    {%- if logo_url %}
@@ -1,54 +0,0 @@
# Community navigator

Hi! Thanks for being here. We’re lucky to have a community of so many passionate developers building with LangChain–we have so much to teach and learn from each other. Community members contribute code, host meetups, write blog posts, amplify each other’s work, become each other's customers and collaborators, and so much more.

Whether you’re new to LangChain, looking to go deeper, or just want to get more exposure to the world of building with LLMs, this page can point you in the right direction.

- **🦜 Contribute to LangChain**

- **🌍 Meetups, Events, and Hackathons**

- **📣 Help Us Amplify Your Work**

- **💬 Stay in the loop**


# 🦜 Contribute to LangChain

LangChain is the product of over 5,000+ contributions by 1,500+ contributors, and there is ******still****** so much to do together. Here are some ways to get involved:

- **[Open a pull request](https://github.com/langchain-ai/langchain/issues):** we’d appreciate all forms of contributions–new features, infrastructure improvements, better documentation, bug fixes, etc. If you have an improvement or an idea, we’d love to work on it with you.
- **[Read our contributor guidelines:](https://github.com/langchain-ai/langchain/blob/bbd22b9b761389a5e40fc45b0570e1830aabb707/.github/CONTRIBUTING.md)** We ask contributors to follow a ["fork and pull request"](https://docs.github.com/en/get-started/quickstart/contributing-to-projects) workflow, run a few local checks for formatting, linting, and testing before submitting, and follow certain documentation and testing conventions.
- **First time contributor?** [Try one of these PRs with the “good first issue” tag](https://github.com/langchain-ai/langchain/contribute).
- **Become an expert:** our experts help the community by answering product questions in Discord. If that’s a role you’d like to play, we’d be so grateful! (And we have some special experts-only goodies/perks we can tell you more about). Send us an email to introduce yourself at hello@langchain.dev and we’ll take it from there!
- **Integrate with LangChain:** if your product integrates with LangChain–or aspires to–we want to help make sure the experience is as smooth as possible for you and end users. Send us an email at hello@langchain.dev and tell us what you’re working on.
- **Become an Integration Maintainer:** Partner with our team to ensure your integration stays up-to-date and talk directly with users (and answer their inquiries) in our Discord. Introduce yourself at hello@langchain.dev if you’d like to explore this role.


# 🌍 Meetups, Events, and Hackathons

One of our favorite things about working in AI is how much enthusiasm there is for building together. We want to help make that as easy and impactful for you as possible!
- **Find a meetup, hackathon, or webinar:** you can find the one for you on on our [global events calendar](https://mirror-feeling-d80.notion.site/0bc81da76a184297b86ca8fc782ee9a3?v=0d80342540df465396546976a50cfb3f).
- **Submit an event to our calendar:** email us at events@langchain.dev with a link to your event page! We can also help you spread the word with our local communities.
- **Host a meetup:** If you want to bring a group of builders together, we want to help! We can publicize your event on our event calendar/Twitter, share with our local communities in Discord, send swag, or potentially hook you up with a sponsor. Email us at events@langchain.dev to tell us about your event!
- **Become a meetup sponsor:** we often hear from groups of builders that want to get together, but are blocked or limited on some dimension (space to host, budget for snacks, prizes to distribute, etc.). If you’d like to help, send us an email to events@langchain.dev we can share more about how it works!
- **Speak at an event:** meetup hosts are always looking for great speakers, presenters, and panelists. If you’d like to do that at an event, send us an email to hello@langchain.dev with more information about yourself, what you want to talk about, and what city you’re based in and we’ll try to match you with an upcoming event!
- **Tell us about your LLM community:** If you host or participate in a community that would welcome support from LangChain and/or our team, send us an email at hello@langchain.dev and let us know how we can help.

# 📣 Help Us Amplify Your Work

If you’re working on something you’re proud of, and think the LangChain community would benefit from knowing about it, we want to help you show it off.

- **Post about your work and mention us:** we love hanging out on Twitter to see what people in the space are talking about and working on. If you tag [@langchainai](https://twitter.com/LangChainAI), we’ll almost certainly see it and can show you some love.
- **Publish something on our blog:** if you’re writing about your experience building with LangChain, we’d love to post (or crosspost) it on our blog! E-mail hello@langchain.dev with a draft of your post! Or even an idea for something you want to write about.
- **Get your product onto our [integrations hub](https://integrations.langchain.com/):** Many developers take advantage of our seamless integrations with other products, and come to our integrations hub to find out who those are. If you want to get your product up there, tell us about it (and how it works with LangChain) at hello@langchain.dev.

# ☀️ Stay in the loop

Here’s where our team hangs out, talks shop, spotlights cool work, and shares what we’re up to. We’d love to see you there too.

- **[Twitter](https://twitter.com/LangChainAI):** we post about what we’re working on and what cool things we’re seeing in the space. If you tag @langchainai in your post, we’ll almost certainly see it, and can snow you some love!
- **[Discord](https://discord.gg/6adMQxSpJS):** connect with with >30k developers who are building with LangChain
- **[GitHub](https://github.com/langchain-ai/langchain):** open pull requests, contribute to a discussion, and/or contribute
- **[Subscribe to our bi-weekly Release Notes](https://6w1pwbss0py.typeform.com/to/KjZB1auB):** a twice/month email roundup of the coolest things going on in our orbit
- **Slack:** if you’re building an application in production at your company, we’d love to get into a Slack channel together. Fill out [this form](https://airtable.com/appwQzlErAS2qiP0L/shrGtGaVBVAz7NcV2) and we’ll get in touch about setting one up.
@@ -8,9 +8,9 @@ import DocCardList from "@theme/DocCardList";

Building applications with language models involves many moving parts. One of the most critical components is ensuring that the outcomes produced by your models are reliable and useful across a broad array of inputs, and that they work well with your application's other software components. Ensuring reliability usually boils down to some combination of application design, testing & evaluation, and runtime checks.

The guides in this section review the APIs and functionality LangChain provides to help you better evaluate your applications. Evaluation and testing are both critical when thinking about deploying LLM applications, since production environments require repeatable and useful outcomes.
The guides in this section review the APIs and functionality LangChain provides to help yous better evaluate your applications. Evaluation and testing are both critical when thinking about deploying LLM applications, since production environments require repeatable and useful outcomes.

LangChain offers various types of evaluators to help you measure performance and integrity on diverse data, and we hope to encourage the community to create and share other useful evaluators so everyone can improve. These docs will introduce the evaluator types, how to use them, and provide some examples of their use in real-world scenarios.
LangChain offers various types of evaluators to help you measure performance and integrity on diverse data, and we hope to encourage the the community to create and share other useful evaluators so everyone can improve. These docs will introduce the evaluator types, how to use them, and provide some examples of their use in real-world scenarios.

Each evaluator type in LangChain comes with ready-to-use implementations and an extensible API that allows for customization according to your unique requirements. Here are some of the types of evaluators we offer:
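For the evaluation page edited above, a hedged sketch of loading one of the ready-made evaluators it mentions; the criterion and sample strings are illustrative assumptions:

```python
# Hedged sketch of LangChain's evaluator API as it existed around this release.
from langchain.evaluation import load_evaluator

evaluator = load_evaluator("criteria", criteria="conciseness")
result = evaluator.evaluate_strings(
    prediction="Paris. It is the capital and most populous city of France.",
    input="What is the capital of France?",
)
print(result)  # typically includes a score and reasoning for the criterion
```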
@@ -5,8 +5,8 @@ import DocCardList from "@theme/DocCardList";
LangSmith helps you trace and evaluate your language model applications and intelligent agents to help you
move from prototype to production.

Check out the [interactive walkthrough](/docs/guides/langsmith/walkthrough) below to get started.
Check out the [interactive walkthrough](walkthrough) below to get started.

For more information, please refer to the [LangSmith documentation](https://docs.smith.langchain.com/)

<DocCardList />
<DocCardList />
@@ -12,7 +12,7 @@ Here are the agents available in LangChain.

### [Zero-shot ReAct](/docs/modules/agents/agent_types/react.html)

This agent uses the [ReAct](https://arxiv.org/pdf/2210.03629) framework to determine which tool to use
This agent uses the [ReAct](https://arxiv.org/pdf/2205.00445.pdf) framework to determine which tool to use
based solely on the tool's description. Any number of tools can be provided.
This agent requires that a description is provided for each tool.

@@ -28,7 +28,7 @@ navigating around a browser.
### [OpenAI Functions](/docs/modules/agents/agent_types/openai_functions_agent.html)

Certain OpenAI models (like gpt-3.5-turbo-0613 and gpt-4-0613) have been explicitly fine-tuned to detect when a
function should be called and respond with the inputs that should be passed to the function.
function should to be called and respond with the inputs that should be passed to the function.
The OpenAI Functions Agent is designed to work with these models.

### [Conversational](/docs/modules/agents/agent_types/chat_conversation_agent.html)
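For the agent-types page edited above, a hedged sketch of constructing a zero-shot ReAct agent; the tool choice and question are illustrative assumptions:

```python
# Hedged sketch of a zero-shot ReAct agent; any tools with good descriptions work.
from langchain.agents import AgentType, initialize_agent, load_tools
from langchain.llms import OpenAI

llm = OpenAI(temperature=0)
tools = load_tools(["llm-math"], llm=llm)  # illustrative tool selection
agent = initialize_agent(
    tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True
)
agent.run("What is 2 raised to the 0.43 power?")
```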
@@ -1,6 +1,6 @@
# OpenAI functions

Certain OpenAI models (like gpt-3.5-turbo-0613 and gpt-4-0613) have been fine-tuned to detect when a function should be called and respond with the inputs that should be passed to the function.
Certain OpenAI models (like gpt-3.5-turbo-0613 and gpt-4-0613) have been fine-tuned to detect when a function should to be called and respond with the inputs that should be passed to the function.
In an API call, you can describe functions and have the model intelligently choose to output a JSON object containing arguments to call those functions.
The goal of the OpenAI Function APIs is to more reliably return valid and useful function calls than a generic text completion or chat API.
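As context for the page edited above, a minimal sketch of a function-calling request using the pre-v1 `openai` client that also appears elsewhere in this compare; the `get_current_weather` schema and the question are illustrative assumptions:

```python
# Hedged sketch: describing a function and letting the model choose to call it.
import openai

functions = [{
    "name": "get_current_weather",
    "description": "Get the current weather in a given location",
    "parameters": {
        "type": "object",
        "properties": {"location": {"type": "string"}},
        "required": ["location"],
    },
}]
response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo-0613",
    messages=[{"role": "user", "content": "What's the weather in Boston?"}],
    functions=functions,
)
# The reply's message may carry a `function_call` with JSON-encoded arguments.
print(response["choices"][0]["message"])
```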
@@ -18,3 +18,5 @@ Let chains choose which tools to use given high-level directives
Persist application state between runs of a chain
#### [Callbacks](/docs/modules/callbacks/)
Log and stream intermediate steps of any chain
#### [Evaluation](/docs/modules/evaluation/)
Evaluate the performance of a chain.
@@ -3,12 +3,10 @@ sidebar_position: 0
---
# Prompts

A prompt for a language model is a set of instructions or input provided by a user to
guide the model's response, helping it understand the context and generate relevant
and coherent language-based output, such as answering questions, completing sentences,
or engaging in a conversation.
The new way of programming models is through prompts.
A **prompt** refers to the input to the model.
This input is often constructed from multiple components.
LangChain provides several classes and functions to make constructing and working with prompts easy.

LangChain provides several classes and functions to help construct and work with prompts.

- [Prompt templates](/docs/modules/model_io/prompts/prompt_templates/): Parametrized model inputs
- [Prompt templates](/docs/modules/model_io/prompts/prompt_templates/): Parametrize model inputs
- [Example selectors](/docs/modules/model_io/prompts/example_selectors/): Dynamically select examples to include in prompts
@@ -4,15 +4,18 @@ sidebar_position: 0

# Prompt templates

Prompt templates are pre-defined recipes for generating prompts for language models.
Language models take text as input - that text is commonly referred to as a prompt.
Typically this is not simply a hardcoded string but rather a combination of a template, some examples, and user input.
LangChain provides several classes and functions to make constructing and working with prompts easy.

A template may include instructions, few shot examples, and specific context and
questions appropriate for a given task.
## What is a prompt template?

LangChain provides tooling to create and work with prompt templates.
A prompt template refers to a reproducible way to generate a prompt. It contains a text string ("the template"), that can take in a set of parameters from the end user and generates a prompt.

LangChain strives to create model agnostic templates to make it easy to reuse
existing templates across different language models.
A prompt template can contain:
- instructions to the language model,
- a set of few shot examples to help the language model generate a better response,
- a question to the language model.

import GetStarted from "@snippets/modules/model_io/prompts/prompt_templates/get_started.mdx"
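As a companion to the prompt-template page edited above, a minimal sketch of the core API it describes; the joke template is an illustrative assumption:

```python
# Hedged sketch of a reproducible, parametrized prompt.
from langchain.prompts import PromptTemplate

prompt = PromptTemplate.from_template("Tell me a {adjective} joke about {content}.")
print(prompt.format(adjective="funny", content="chickens"))
# -> "Tell me a funny joke about chickens."
```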
9  docs/docs_skeleton/docs/use_cases/apis/api.mdx (new file)
@@ -0,0 +1,9 @@
---
sidebar_position: 0
---
# API chains
APIChain enables using LLMs to interact with APIs to retrieve relevant information. Construct the chain by providing a question relevant to the provided API documentation.

import Example from "@snippets/modules/chains/popular/api.mdx"

<Example/>
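For the `APIChain` page added above, a hedged construction sketch; the OpenAI model, the bundled `open_meteo_docs`, and the weather question are illustrative assumptions, not part of this diff:

```python
# Hedged sketch: building an APIChain from an LLM plus API documentation.
from langchain.chains import APIChain
from langchain.chains.api import open_meteo_docs
from langchain.llms import OpenAI

llm = OpenAI(temperature=0)
chain = APIChain.from_llm_and_api_docs(llm, open_meteo_docs.OPEN_METEO_DOCS, verbose=True)
chain.run("What is the weather like right now in Munich, Germany in degrees Fahrenheit?")
```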
@@ -0,0 +1,8 @@
# Summarization

A summarization chain can be used to summarize multiple documents. One way is to input multiple smaller documents, after they have been divided into chunks, and operate over them with a MapReduceDocumentsChain. You can also choose instead for the chain that does summarization to be a StuffDocumentsChain, or a RefineDocumentsChain.

import Example from "@snippets/modules/chains/popular/summarize.mdx"

<Example/>
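For the summarization page added above, a hedged sketch of loading a `map_reduce` summarization chain over pre-split chunks; the source URL and splitter settings are illustrative assumptions:

```python
# Hedged sketch: map-reduce summarization over chunked documents.
from langchain.chains.summarize import load_summarize_chain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import CharacterTextSplitter

loader = WebBaseLoader("https://example.com/article")  # hypothetical source
docs = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0).split_documents(loader.load())

llm = ChatOpenAI(temperature=0)
chain = load_summarize_chain(llm, chain_type="map_reduce")  # or "stuff" / "refine"
print(chain.run(docs))
```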
@@ -1,7 +1,7 @@
# SQL Database Chain
# SQL

This example demonstrates the use of the `SQLDatabaseChain` for answering questions over a SQL database.

import Example from "@snippets/modules/chains/popular/sqlite.mdx"

<Example/>
<Example/>
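For the SQL page edited above, a hedged sketch of `SQLDatabaseChain`; the `langchain_experimental` import path follows the migration announced in the README hunk earlier in this compare, and the Chinook database and question are illustrative assumptions:

```python
# Hedged sketch: question answering over a SQL database.
from langchain.llms import OpenAI
from langchain.utilities import SQLDatabase
from langchain_experimental.sql import SQLDatabaseChain  # post-migration path

db = SQLDatabase.from_uri("sqlite:///Chinook.db")  # hypothetical local database
llm = OpenAI(temperature=0)
chain = SQLDatabaseChain.from_llm(llm, db, verbose=True)
chain.run("How many employees are there?")
```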
@@ -1,9 +0,0 @@
---
sidebar_position: 3
---

# Web Scraping

Web scraping has historically been a challenging endeavor due to the ever-changing nature of website structures, making it tedious for developers to maintain their scraping scripts. Traditional methods often rely on specific HTML tags and patterns which, when altered, can disrupt data extraction processes.

Enter the LLM-based method for parsing HTML: By leveraging the capabilities of LLMs, and especially OpenAI Functions in LangChain's extraction chain, developers can instruct the model to extract only the desired data in a specified format. This method not only streamlines the extraction process but also significantly reduces the time spent on manual debugging and script modifications. Its adaptability means that even if websites undergo significant design changes, the extraction remains consistent and robust. This level of resilience translates to reduced maintenance efforts, cost savings, and ensures a higher quality of extracted data. Compared to its predecessors, LLM-based approach wins out the web scraping domain by transforming a historically cumbersome task into a more automated and efficient process.
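The removed page above describes OpenAI-Functions-based extraction; a hedged sketch of LangChain's extraction chain as it existed around this time (the schema and the `page_content` input are illustrative assumptions):

```python
# Hedged sketch: schema-driven extraction from scraped page text.
from langchain.chains import create_extraction_chain
from langchain.chat_models import ChatOpenAI

schema = {
    "properties": {
        "title": {"type": "string"},
        "price": {"type": "string"},
    },
    "required": ["title"],
}
llm = ChatOpenAI(model="gpt-3.5-turbo-0613", temperature=0)
chain = create_extraction_chain(schema, llm)

page_content = "..."  # assumed: HTML/text already fetched by your scraper
print(chain.run(page_content))
```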
@@ -128,10 +128,6 @@ const config = {
        hideable: true,
      },
    },
    colorMode: {
      disableSwitch: false,
      respectPrefersColorScheme: true,
    },
    prism: {
      theme: {
        ...baseLightCodeBlockTheme,
71  docs/docs_skeleton/package-lock.json (generated)
@@ -12,7 +12,7 @@
        "@docusaurus/preset-classic": "2.4.0",
        "@docusaurus/remark-plugin-npm2yarn": "^2.4.0",
        "@mdx-js/react": "^1.6.22",
        "@mendable/search": "^0.0.150",
        "@mendable/search": "^0.0.125",
        "clsx": "^1.2.1",
        "json-loader": "^0.5.7",
        "process": "^0.11.10",
@@ -3212,11 +3212,10 @@
      }
    },
    "node_modules/@mendable/search": {
      "version": "0.0.150",
      "resolved": "https://registry.npmjs.org/@mendable/search/-/search-0.0.150.tgz",
      "integrity": "sha512-Eb5SeAWlMxzEim/8eJ/Ysn01Pyh39xlPBzRBw/5OyOBhti0HVLXk4wd1Fq2TKgJC2ppQIvhEKO98PUcj9dNDFw==",
      "version": "0.0.125",
      "resolved": "https://registry.npmjs.org/@mendable/search/-/search-0.0.125.tgz",
      "integrity": "sha512-Mb1J3zDhOyBZV9cXqJocSOBNYGpe8+LQDqd9n9laPWxosSJcSTUewqtlIbMerrYsScBsxskoSiWgRsc7xF5z0Q==",
      "dependencies": {
        "html-react-parser": "^4.2.0",
        "posthog-js": "^1.45.1"
      },
      "peerDependencies": {
@@ -8333,33 +8332,6 @@
        "safe-buffer": "~5.1.0"
      }
    },
    "node_modules/html-dom-parser": {
      "version": "4.0.0",
      "resolved": "https://registry.npmjs.org/html-dom-parser/-/html-dom-parser-4.0.0.tgz",
      "integrity": "sha512-TUa3wIwi80f5NF8CVWzkopBVqVAtlawUzJoLwVLHns0XSJGynss4jiY0mTWpiDOsuyw+afP+ujjMgRh9CoZcXw==",
      "dependencies": {
        "domhandler": "5.0.3",
        "htmlparser2": "9.0.0"
      }
    },
    "node_modules/html-dom-parser/node_modules/htmlparser2": {
      "version": "9.0.0",
      "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-9.0.0.tgz",
      "integrity": "sha512-uxbSI98wmFT/G4P2zXx4OVx04qWUmyFPrD2/CNepa2Zo3GPNaCaaxElDgwUrwYWkK1nr9fft0Ya8dws8coDLLQ==",
      "funding": [
        "https://github.com/fb55/htmlparser2?sponsor=1",
        {
          "type": "github",
          "url": "https://github.com/sponsors/fb55"
        }
      ],
      "dependencies": {
        "domelementtype": "^2.3.0",
        "domhandler": "^5.0.3",
        "domutils": "^3.1.0",
        "entities": "^4.5.0"
      }
    },
    "node_modules/html-entities": {
      "version": "2.4.0",
      "resolved": "https://registry.npmjs.org/html-entities/-/html-entities-2.4.0.tgz",
@@ -8403,20 +8375,6 @@
        "node": ">= 12"
      }
    },
    "node_modules/html-react-parser": {
      "version": "4.2.0",
      "resolved": "https://registry.npmjs.org/html-react-parser/-/html-react-parser-4.2.0.tgz",
      "integrity": "sha512-gzU55AS+FI6qD7XaKe5BLuLFM2Xw0/LodfMWZlxV9uOHe7LCD5Lukx/EgYuBI3c0kLu0XlgFXnSzO0qUUn3Vrg==",
      "dependencies": {
        "domhandler": "5.0.3",
        "html-dom-parser": "4.0.0",
        "react-property": "2.0.0",
        "style-to-js": "1.1.3"
      },
      "peerDependencies": {
        "react": "0.14 || 15 || 16 || 17 || 18"
      }
    },
    "node_modules/html-tags": {
      "version": "3.3.1",
      "resolved": "https://registry.npmjs.org/html-tags/-/html-tags-3.3.1.tgz",
@@ -11804,11 +11762,6 @@
        "webpack": ">=4.41.1 || 5.x"
      }
    },
    "node_modules/react-property": {
      "version": "2.0.0",
      "resolved": "https://registry.npmjs.org/react-property/-/react-property-2.0.0.tgz",
      "integrity": "sha512-kzmNjIgU32mO4mmH5+iUyrqlpFQhF8K2k7eZ4fdLSOPFrD1XgEuSBv9LDEgxRXTMBqMd8ppT0x6TIzqE5pdGdw=="
    },
    "node_modules/react-router": {
      "version": "5.3.4",
      "resolved": "https://registry.npmjs.org/react-router/-/react-router-5.3.4.tgz",
@@ -13174,22 +13127,6 @@
        "url": "https://github.com/sponsors/sindresorhus"
      }
    },
    "node_modules/style-to-js": {
      "version": "1.1.3",
      "resolved": "https://registry.npmjs.org/style-to-js/-/style-to-js-1.1.3.tgz",
      "integrity": "sha512-zKI5gN/zb7LS/Vm0eUwjmjrXWw8IMtyA8aPBJZdYiQTXj4+wQ3IucOLIOnF7zCHxvW8UhIGh/uZh/t9zEHXNTQ==",
      "dependencies": {
        "style-to-object": "0.4.1"
      }
    },
    "node_modules/style-to-js/node_modules/style-to-object": {
      "version": "0.4.1",
      "resolved": "https://registry.npmjs.org/style-to-object/-/style-to-object-0.4.1.tgz",
      "integrity": "sha512-HFpbb5gr2ypci7Qw+IOhnP2zOU7e77b+rzM+wTzXzfi1PrtBCX0E7Pk4wL4iTLnhzZ+JgEGAhX81ebTg/aYjQw==",
      "dependencies": {
        "inline-style-parser": "0.1.1"
      }
    },
    "node_modules/style-to-object": {
      "version": "0.3.0",
      "resolved": "https://registry.npmjs.org/style-to-object/-/style-to-object-0.3.0.tgz",

@@ -23,7 +23,7 @@
    "@docusaurus/preset-classic": "2.4.0",
    "@docusaurus/remark-plugin-npm2yarn": "^2.4.0",
    "@mdx-js/react": "^1.6.22",
    "@mendable/search": "^0.0.150",
    "@mendable/search": "^0.0.125",
    "clsx": "^1.2.1",
    "json-loader": "^0.5.7",
    "process": "^0.11.10",

@@ -75,7 +75,6 @@ module.exports = {
        slug: "additional_resources",
      },
    },
    'community'
  ],
  integrations: [
    {
[27 images removed in this compare — sizes ranging from 35 KiB to 716 KiB]
@@ -1 +0,0 @@
label: 'Adapters'
@@ -1,323 +0,0 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "700a516b",
   "metadata": {},
   "source": [
    "# OpenAI Adapter\n",
    "\n",
    "A lot of people get started with OpenAI but want to explore other models. LangChain's integrations with many model providers make this easy to do so. While LangChain has it's own message and model APIs, we've also made it as easy as possible to explore other models by exposing an adapter to adapt LangChain models to the OpenAI api.\n",
    "\n",
    "At the moment this only deals with output and does not return other information (token counts, stop reasons, etc)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "6017f26a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import openai\n",
    "from langchain.adapters import openai as lc_openai"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b522ceda",
   "metadata": {},
   "source": [
    "## ChatCompletion.create"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "1d22eb61",
   "metadata": {},
   "outputs": [],
   "source": [
    "messages = [{\"role\": \"user\", \"content\": \"hi\"}]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d550d3ad",
   "metadata": {},
   "source": [
    "Original OpenAI call"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "e1d27dfa",
   "metadata": {},
   "outputs": [],
   "source": [
    "result = openai.ChatCompletion.create(\n",
    "    messages=messages, \n",
    "    model=\"gpt-3.5-turbo\", \n",
    "    temperature=0\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "012d81ae",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'role': 'assistant', 'content': 'Hello! How can I assist you today?'}"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "result[\"choices\"][0]['message'].to_dict_recursive()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "db5b5500",
   "metadata": {},
   "source": [
    "LangChain OpenAI wrapper call"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "87c2d515",
   "metadata": {},
   "outputs": [],
   "source": [
    "lc_result = lc_openai.ChatCompletion.create(\n",
    "    messages=messages, \n",
    "    model=\"gpt-3.5-turbo\", \n",
    "    temperature=0\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "c67a5ac8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'role': 'assistant', 'content': 'Hello! How can I assist you today?'}"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lc_result[\"choices\"][0]['message']"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "034ba845",
   "metadata": {},
   "source": [
    "Swapping out model providers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "7a2c011c",
   "metadata": {},
   "outputs": [],
   "source": [
    "lc_result = lc_openai.ChatCompletion.create(\n",
    "    messages=messages, \n",
    "    model=\"claude-2\", \n",
    "    temperature=0, \n",
    "    provider=\"ChatAnthropic\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "f7c94827",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'role': 'assistant', 'content': ' Hello!'}"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lc_result[\"choices\"][0]['message']"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cb3f181d",
   "metadata": {},
   "source": [
    "## ChatCompletion.stream"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f7b8cd18",
   "metadata": {},
   "source": [
    "Original OpenAI call"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "fd8cb1ea",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'role': 'assistant', 'content': ''}\n",
      "{'content': 'Hello'}\n",
      "{'content': '!'}\n",
      "{'content': ' How'}\n",
      "{'content': ' can'}\n",
      "{'content': ' I'}\n",
      "{'content': ' assist'}\n",
      "{'content': ' you'}\n",
      "{'content': ' today'}\n",
      "{'content': '?'}\n",
      "{}\n"
     ]
    }
   ],
   "source": [
    "for c in openai.ChatCompletion.create(\n",
    "    messages = messages,\n",
    "    model=\"gpt-3.5-turbo\", \n",
    "    temperature=0,\n",
    "    stream=True\n",
    "):\n",
    "    print(c[\"choices\"][0]['delta'].to_dict_recursive())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0b2a076b",
   "metadata": {},
   "source": [
    "LangChain OpenAI wrapper call"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "9521218c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'role': 'assistant', 'content': ''}\n",
      "{'content': 'Hello'}\n",
      "{'content': '!'}\n",
      "{'content': ' How'}\n",
      "{'content': ' can'}\n",
      "{'content': ' I'}\n",
      "{'content': ' assist'}\n",
      "{'content': ' you'}\n",
      "{'content': ' today'}\n",
      "{'content': '?'}\n",
      "{}\n"
     ]
    }
   ],
   "source": [
    "for c in lc_openai.ChatCompletion.create(\n",
    "    messages = messages,\n",
    "    model=\"gpt-3.5-turbo\", \n",
    "    temperature=0,\n",
    "    stream=True\n",
    "):\n",
    "    print(c[\"choices\"][0]['delta'])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0fc39750",
   "metadata": {},
   "source": [
    "Swapping out model providers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "68f0214e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'role': 'assistant', 'content': ' Hello'}\n",
      "{'content': '!'}\n",
      "{}\n"
     ]
    }
   ],
   "source": [
    "for c in lc_openai.ChatCompletion.create(\n",
    "    messages = messages,\n",
    "    model=\"claude-2\", \n",
    "    temperature=0,\n",
    "    stream=True,\n",
    "    provider=\"ChatAnthropic\",\n",
    "):\n",
    "    print(c[\"choices\"][0]['delta'])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
@@ -25,7 +25,16 @@
|
||||
"execution_count": 1,
|
||||
"id": "466b65b3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/Users/harrisonchase/.pyenv/versions/3.9.1/envs/langchain/lib/python3.9/site-packages/deeplake/util/check_latest_version.py:32: UserWarning: A newer version of deeplake (3.6.14) is available. It's recommended that you update to the latest version using `pip install -U deeplake`.\n",
|
||||
" warnings.warn(\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.prompts import ChatPromptTemplate\n",
|
||||
"from langchain.chat_models import ChatOpenAI"
|
||||
@@ -33,7 +42,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": 2,
|
||||
"id": "3c634ef0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -171,7 +180,9 @@
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "decf7710",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
@@ -200,7 +211,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 7,
|
||||
"id": "f799664d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -345,7 +356,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": 19,
|
||||
"id": "5d3d8ffe",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -366,7 +377,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 20,
|
||||
"id": "33be32af",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -378,7 +389,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 21,
|
||||
"id": "df3f3fa2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -390,7 +401,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"execution_count": 22,
|
||||
"id": "bfc47ec1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -405,7 +416,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"execution_count": 24,
|
||||
"id": "eae31755",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -420,9 +431,11 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"execution_count": 25,
|
||||
"id": "f3040b0c",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
@@ -437,7 +450,7 @@
|
||||
"'Harrison worked at Kensho.'"
|
||||
]
|
||||
},
|
||||
"execution_count": 18,
|
||||
"execution_count": 25,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@@ -448,7 +461,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"execution_count": 27,
|
||||
"id": "e1d20c7c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -471,9 +484,11 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"execution_count": 28,
|
||||
"id": "7ee8b2d4",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
@@ -488,7 +503,7 @@
|
||||
"'Harrison ha lavorato a Kensho.'"
|
||||
]
|
||||
},
|
||||
"execution_count": 20,
|
||||
"execution_count": 28,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@@ -497,317 +512,6 @@
|
||||
"chain.invoke({\"question\": \"where did harrison work\", \"language\": \"italian\"})"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f007669c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Conversational Retrieval Chain\n",
|
||||
"\n",
|
||||
"We can easily add in conversation history. This primarily means adding in chat_message_history"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "3f30c348",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.schema.runnable import RunnableMap\n",
|
||||
"from langchain.schema import format_document"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "64ab1dbf",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.prompts.prompt import PromptTemplate\n",
|
||||
"\n",
|
||||
"_template = \"\"\"Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.\n",
|
||||
"\n",
|
||||
"Chat History:\n",
|
||||
"{chat_history}\n",
|
||||
"Follow Up Input: {question}\n",
|
||||
"Standalone question:\"\"\"\n",
|
||||
"CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "7d628c97",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"template = \"\"\"Answer the question based only on the following context:\n",
|
||||
"{context}\n",
|
||||
"\n",
|
||||
"Question: {question}\n",
|
||||
"\"\"\"\n",
|
||||
"ANSWER_PROMPT = ChatPromptTemplate.from_template(template)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "f60a5d0f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template=\"{page_content}\")\n",
|
||||
"def _combine_documents(docs, document_prompt = DEFAULT_DOCUMENT_PROMPT, document_separator=\"\\n\\n\"):\n",
|
||||
" doc_strings = [format_document(doc, document_prompt) for doc in docs]\n",
|
||||
" return document_separator.join(doc_strings)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "7d007db6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from typing import Tuple, List\n",
|
||||
"def _format_chat_history(chat_history: List[Tuple]) -> str:\n",
|
||||
" buffer = \"\"\n",
|
||||
" for dialogue_turn in chat_history:\n",
|
||||
" human = \"Human: \" + dialogue_turn[0]\n",
|
||||
" ai = \"Assistant: \" + dialogue_turn[1]\n",
|
||||
" buffer += \"\\n\" + \"\\n\".join([human, ai])\n",
|
||||
" return buffer"
|
||||
]
|
||||
},
|
||||
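{
"cell_type": "markdown",
"id": "9e4f1a77",
"metadata": {},
"source": [
"For example, `_format_chat_history([(\"Who wrote this notebook?\", \"Harrison\")])` returns `'\\nHuman: Who wrote this notebook?\\nAssistant: Harrison'`, which is the string the condense prompt receives as `{chat_history}`."
]
},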
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"id": "5c32cc89",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"_inputs = RunnableMap(\n",
|
||||
" {\n",
|
||||
" \"standalone_question\": {\n",
|
||||
" \"question\": lambda x: x[\"question\"],\n",
|
||||
" \"chat_history\": lambda x: _format_chat_history(x['chat_history'])\n",
|
||||
" } | CONDENSE_QUESTION_PROMPT | ChatOpenAI(temperature=0) | StrOutputParser(),\n",
|
||||
" }\n",
|
||||
")\n",
|
||||
"_context = {\n",
|
||||
" \"context\": itemgetter(\"standalone_question\") | retriever | _combine_documents,\n",
|
||||
" \"question\": lambda x: x[\"standalone_question\"]\n",
|
||||
"}\n",
|
||||
"conversational_qa_chain = _inputs | _context | ANSWER_PROMPT | ChatOpenAI()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"id": "135c8205",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"AIMessage(content='Harrison was employed at Kensho.', additional_kwargs={}, example=False)"
|
||||
]
|
||||
},
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"conversational_qa_chain.invoke({\n",
|
||||
" \"question\": \"where did harrison work?\",\n",
|
||||
" \"chat_history\": [],\n",
|
||||
"})"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"id": "424e7e7a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"AIMessage(content='Harrison worked at Kensho.', additional_kwargs={}, example=False)"
|
||||
]
|
||||
},
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"conversational_qa_chain.invoke({\n",
|
||||
" \"question\": \"where did he work?\",\n",
|
||||
" \"chat_history\": [(\"Who wrote this notebook?\", \"Harrison\")],\n",
|
||||
"})"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c5543183",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### With Memory and returning source documents\n",
|
||||
"\n",
|
||||
"This shows how to use memory with the above. For memory, we need to manage that outside at the memory. For returning the retrieved documents, we just need to pass them through all the way."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"id": "e31dd17c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.memory import ConversationBufferMemory"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 44,
|
||||
"id": "d4bffe94",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"memory = ConversationBufferMemory(return_messages=True, output_key=\"answer\", input_key=\"question\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 45,
|
||||
"id": "733be985",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# First we add a step to load memory\n",
|
||||
"# This needs to be a RunnableMap because its the first input\n",
|
||||
"loaded_memory = RunnableMap(\n",
|
||||
" {\n",
|
||||
" \"question\": itemgetter(\"question\"),\n",
|
||||
" \"memory\": memory.load_memory_variables,\n",
|
||||
" }\n",
|
||||
")\n",
|
||||
"# Next we add a step to expand memory into the variables\n",
|
||||
"expanded_memory = {\n",
|
||||
" \"question\": itemgetter(\"question\"),\n",
|
||||
" \"chat_history\": lambda x: x[\"memory\"][\"history\"]\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"# Now we calculate the standalone question\n",
|
||||
"standalone_question = {\n",
|
||||
" \"standalone_question\": {\n",
|
||||
" \"question\": lambda x: x[\"question\"],\n",
|
||||
" \"chat_history\": lambda x: _format_chat_history(x['chat_history'])\n",
|
||||
" } | CONDENSE_QUESTION_PROMPT | ChatOpenAI(temperature=0) | StrOutputParser(),\n",
|
||||
"}\n",
|
||||
"# Now we retrieve the documents\n",
|
||||
"retrieved_documents = {\n",
|
||||
" \"docs\": itemgetter(\"standalone_question\") | retriever,\n",
|
||||
" \"question\": lambda x: x[\"standalone_question\"]\n",
|
||||
"}\n",
|
||||
"# Now we construct the inputs for the final prompt\n",
|
||||
"final_inputs = {\n",
|
||||
" \"context\": lambda x: _combine_documents(x[\"docs\"]),\n",
|
||||
" \"question\": itemgetter(\"question\")\n",
|
||||
"}\n",
|
||||
"# And finally, we do the part that returns the answers\n",
|
||||
"answer = {\n",
|
||||
" \"answer\": final_inputs | ANSWER_PROMPT | ChatOpenAI(),\n",
|
||||
" \"docs\": itemgetter(\"docs\"),\n",
|
||||
"}\n",
|
||||
"# And now we put it all together!\n",
|
||||
"final_chain = loaded_memory | expanded_memory | standalone_question | retrieved_documents | answer"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 46,
|
||||
"id": "806e390c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'answer': AIMessage(content='Harrison was employed at Kensho.', additional_kwargs={}, example=False),\n",
|
||||
" 'docs': [Document(page_content='harrison worked at kensho', metadata={})]}"
|
||||
]
|
||||
},
|
||||
"execution_count": 46,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"inputs = {\"question\": \"where did harrison work?\"}\n",
|
||||
"result = final_chain.invoke(inputs)\n",
|
||||
"result"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 47,
|
||||
"id": "977399fd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Note that the memory does not save automatically\n",
|
||||
"# This will be improved in the future\n",
|
||||
"# For now you need to save it yourself\n",
|
||||
"memory.save_context(inputs, {\"answer\": result[\"answer\"].content})"
|
||||
]
|
||||
},
|
||||
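{
"cell_type": "markdown",
"id": "7c8d2e19",
"metadata": {},
"source": [
"Since saving is manual for now, a small helper can keep invocation and persistence together. A minimal sketch (the `ask` function below is hypothetical, not a LangChain API; `final_chain` and `memory` are defined above):\n",
"\n",
"```python\n",
"def ask(question: str) -> str:\n",
"    # Run the chain, then persist the turn so follow-up questions can see it.\n",
"    inputs = {\"question\": question}\n",
"    result = final_chain.invoke(inputs)\n",
"    memory.save_context(inputs, {\"answer\": result[\"answer\"].content})\n",
"    return result[\"answer\"].content\n",
"```"
]
},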
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 48,
|
||||
"id": "f94f7de4",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'history': [HumanMessage(content='where did harrison work?', additional_kwargs={}, example=False),\n",
|
||||
" AIMessage(content='Harrison was employed at Kensho.', additional_kwargs={}, example=False)]}"
|
||||
]
|
||||
},
|
||||
"execution_count": 48,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"memory.load_memory_variables({})"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0f2bf8d3",
|
||||
@@ -1379,265 +1083,13 @@
|
||||
"chain.invoke({\"input\": \"whats 2 plus 2\"})"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "5062941a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Memory\n",
|
||||
"\n",
|
||||
"This shows how to add memory to an arbitrary chain. Right now, you can use the memory classes but need to hook it up manually"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 99,
|
||||
"id": "7998efd8",
|
||||
"execution_count": null,
|
||||
"id": "9be88499",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.memory import ConversationBufferMemory\n",
|
||||
"from langchain.schema.runnable import RunnableMap\n",
|
||||
"from langchain.prompts import MessagesPlaceholder\n",
|
||||
"model = ChatOpenAI()\n",
|
||||
"prompt = ChatPromptTemplate.from_messages([\n",
|
||||
" (\"system\", \"You are a helpful chatbot\"),\n",
|
||||
" MessagesPlaceholder(variable_name=\"history\"),\n",
|
||||
" (\"human\", \"{input}\")\n",
|
||||
"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 100,
|
||||
"id": "fa0087f3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"memory = ConversationBufferMemory(return_messages=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 101,
|
||||
"id": "06b531ae",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'history': []}"
|
||||
]
|
||||
},
|
||||
"execution_count": 101,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"memory.load_memory_variables({})"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 102,
|
||||
"id": "d9437af6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"chain = RunnableMap({\n",
|
||||
" \"input\": lambda x: x[\"input\"],\n",
|
||||
" \"memory\": memory.load_memory_variables\n",
|
||||
"}) | {\n",
|
||||
" \"input\": lambda x: x[\"input\"],\n",
|
||||
" \"history\": lambda x: x[\"memory\"][\"history\"]\n",
|
||||
"} | prompt | model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 103,
|
||||
"id": "bed1e260",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"AIMessage(content='Hello Bob! How can I assist you today?', additional_kwargs={}, example=False)"
|
||||
]
|
||||
},
|
||||
"execution_count": 103,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"inputs = {\"input\": \"hi im bob\"}\n",
|
||||
"response = chain.invoke(inputs)\n",
|
||||
"response"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 104,
|
||||
"id": "890475b4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"memory.save_context(inputs, {\"output\": response.content})"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 105,
|
||||
"id": "e8fcb77f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'history': [HumanMessage(content='hi im bob', additional_kwargs={}, example=False),\n",
|
||||
" AIMessage(content='Hello Bob! How can I assist you today?', additional_kwargs={}, example=False)]}"
|
||||
]
|
||||
},
|
||||
"execution_count": 105,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"memory.load_memory_variables({})"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 106,
|
||||
"id": "d837d5c3",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"AIMessage(content='Your name is Bob. You mentioned it in your previous message. Is there anything else I can help you with, Bob?', additional_kwargs={}, example=False)"
|
||||
]
|
||||
},
|
||||
"execution_count": 106,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"inputs = {\"input\": \"whats my name\"}\n",
|
||||
"response = chain.invoke(inputs)\n",
|
||||
"response"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4927a727-b4c8-453c-8c83-bd87b4fcac14",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Moderation\n",
|
||||
"\n",
|
||||
"This shows how to add in moderation (or other safeguards) around your LLM application."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"id": "4f5f6449-940a-4f5c-97c0-39b71c3e2a68",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.chains import OpenAIModerationChain\n",
|
||||
"from langchain.llms import OpenAI"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 35,
|
||||
"id": "fcb8312b-7e7a-424f-a3ec-76738c9a9d21",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"moderate = OpenAIModerationChain()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 32,
|
||||
"id": "b24b9148-f6b0-4091-8ea8-d3fb281bd950",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model = OpenAI()\n",
|
||||
"prompt = ChatPromptTemplate.from_messages([\n",
|
||||
" (\"system\", \"repeat after me: {input}\")\n",
|
||||
"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 33,
|
||||
"id": "1c8ed87c-9ca6-4559-bf60-d40e94a0af08",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"chain = prompt | model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 34,
|
||||
"id": "5256b9bd-381a-42b0-bfa8-7e6d18f853cb",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'\\n\\nYou are stupid.'"
|
||||
]
|
||||
},
|
||||
"execution_count": 34,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"chain.invoke({\"input\": \"you are stupid\"})"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 36,
|
||||
"id": "fe6e3b33-dc9a-49d5-b194-ba750c58a628",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"moderated_chain = chain | moderate"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 37,
|
||||
"id": "d8ba0cbd-c739-4d23-be9f-6ae092bd5ffb",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'input': '\\n\\nYou are stupid.',\n",
|
||||
" 'output': \"Text was found that violates OpenAI's content policy.\"}"
|
||||
]
|
||||
},
|
||||
"execution_count": 37,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"moderated_chain.invoke({\"input\": \"you are stupid\"})"
|
||||
]
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
@@ -1656,7 +1108,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.1"
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -19,29 +19,16 @@
|
||||
"- `ainvoke`: call the chain on an input async\n",
|
||||
"- `abatch`: call the chain on a list of inputs async\n",
|
||||
"\n",
|
||||
"The type of the input varies by component:\n",
|
||||
"The type of the input varies by component. For a prompt it is a dictionary, for a retriever it is a single string, for a model either a single string, a list of chat messages, or a PromptValue.\n",
|
||||
"\n",
|
||||
"| Component | Input Type |\n",
|
||||
"| --- | --- |\n",
|
||||
"|Prompt|Dictionary|\n",
|
||||
"|Retriever|Single string|\n",
|
||||
"|Model| Single string, list of chat messages or a PromptValue|\n",
|
||||
"\n",
|
||||
"The output type also varies by component:\n",
|
||||
"\n",
|
||||
"| Component | Output Type |\n",
|
||||
"| --- | --- |\n",
|
||||
"| LLM | String |\n",
|
||||
"| ChatModel | ChatMessage |\n",
|
||||
"| Prompt | PromptValue |\n",
|
||||
"| Retriever | List of documents |\n",
|
||||
"The output type also varies by component. For an LLM it is a string, for a ChatModel it's a ChatMessage, for a prompt it's a PromptValue, for a retriever it's a list of documents.\n",
|
||||
"\n",
|
||||
"Let's take a look at these methods! To do so, we'll create a super simple PromptTemplate + ChatModel chain."
|
||||
]
|
||||
},
|
||||
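{
"cell_type": "markdown",
"id": "3b6e0d42",
"metadata": {},
"source": [
"As a quick illustration of those types (a sketch, assuming the `prompt` and `model` defined in the cells below):\n",
"\n",
"```python\n",
"prompt_value = prompt.invoke({\"topic\": \"bears\"})  # prompt: dictionary in, PromptValue out\n",
"message = model.invoke(prompt_value)              # ChatModel: PromptValue (or string/messages) in, ChatMessage out\n",
"```"
]
},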
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 20,
|
||||
"id": "466b65b3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -108,7 +95,7 @@
|
||||
],
|
||||
"source": [
|
||||
"for s in chain.stream({\"topic\": \"bears\"}):\n",
|
||||
" print(s.content, end=\"\", flush=True)"
|
||||
" print(s.content, end=\"\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -196,7 +183,7 @@
|
||||
],
|
||||
"source": [
|
||||
"async for s in chain.astream({\"topic\": \"bears\"}):\n",
|
||||
" print(s.content, end=\"\", flush=True)"
|
||||
" print(s.content, end=\"\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -256,131 +243,6 @@
|
||||
"source": [
|
||||
"await chain.abatch([{\"topic\": \"bears\"}])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0a1c409d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Parallelism\n",
|
||||
"\n",
|
||||
"Let's take a look at how LangChain Expression Language support parralel requests as much as possible. For example, when using a RunnableMapping (often written as a dictionary) it executes each element in parralel."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "e3014c7a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.schema.runnable import RunnableMap\n",
|
||||
"chain1 = ChatPromptTemplate.from_template(\"tell me a joke about {topic}\") | model\n",
|
||||
"chain2 = ChatPromptTemplate.from_template(\"write a short (2 line) poem about {topic}\") | model\n",
|
||||
"combined = RunnableMap({\n",
|
||||
" \"joke\": chain1,\n",
|
||||
" \"poem\": chain2,\n",
|
||||
"})"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "08044c0a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"CPU times: user 31.7 ms, sys: 8.59 ms, total: 40.3 ms\n",
|
||||
"Wall time: 1.05 s\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"AIMessage(content=\"Why don't bears like fast food?\\n\\nBecause they can't catch it!\", additional_kwargs={}, example=False)"
|
||||
]
|
||||
},
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%%time\n",
|
||||
"chain1.invoke({\"topic\": \"bears\"})"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "22c56804",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"CPU times: user 42.9 ms, sys: 10.2 ms, total: 53 ms\n",
|
||||
"Wall time: 1.93 s\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"AIMessage(content=\"In forest's embrace, bears roam free,\\nSilent strength, nature's majesty.\", additional_kwargs={}, example=False)"
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%%time\n",
|
||||
"chain2.invoke({\"topic\": \"bears\"})"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "4fff4cbb",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"CPU times: user 96.3 ms, sys: 20.4 ms, total: 117 ms\n",
|
||||
"Wall time: 1.1 s\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'joke': AIMessage(content=\"Why don't bears wear socks?\\n\\nBecause they have bear feet!\", additional_kwargs={}, example=False),\n",
|
||||
" 'poem': AIMessage(content=\"In forest's embrace,\\nMajestic bears leave their trace.\", additional_kwargs={}, example=False)}"
|
||||
]
|
||||
},
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%%time\n",
|
||||
"combined.invoke({\"topic\": \"bears\"})"
|
||||
]
|
||||
},
|
||||
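{
"cell_type": "markdown",
"id": "5a2c7f90",
"metadata": {},
"source": [
"Note that `combined` finishes in roughly the wall time of a single call (~1 s) rather than the ~3 s the two calls would take back to back, because the map runs its elements in parallel."
]
},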
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "fab75d1d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
@@ -399,7 +261,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.1"
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -1,430 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "19c9cbd6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Fallbacks\n",
|
||||
"\n",
|
||||
"When working with language models, you may often encounter issues from the underlying APIs, whether these be rate limiting or downtime. Therefore, as you go to move your LLM applications into production it becomes more and more important to safe guard against these. That's why we've introduced the concept of fallbacks.\n",
|
||||
"\n",
|
||||
"Crucially, fallbacks can be applied not only on the LLM level but on the whole runnable level. This is important because often times different models require different prompts. So if your call to OpenAI fails, you don't just want to send the same prompt to Anthropic - you probably want want to use a different prompt template and send a different version there."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a6bb9ba9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Handling LLM API Errors\n",
|
||||
"\n",
|
||||
"This is maybe the most common use case for fallbacks. A request to an LLM API can fail for a variety of reasons - the API could be down, you could have hit rate limits, any number of things. Therefor, using fallbacks can help protect against these types of things.\n",
|
||||
"\n",
|
||||
"IMPORTANT: By default, a lot of the LLM wrappers catch errors and retry. You will most likely want to turn those off when working with fallbacks. Otherwise the first wrapper will keep on retying and not failing."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"id": "d3e893bf",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.chat_models import ChatOpenAI, ChatAnthropic"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4847c82d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"First, let's mock out what happens if we hit a RateLimitError from OpenAI"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"id": "dfdd8bf5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from unittest.mock import patch\n",
|
||||
"from openai.error import RateLimitError"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"id": "e6fdffc1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Note that we set max_retries = 0 to avoid retrying on RateLimits, etc\n",
|
||||
"openai_llm = ChatOpenAI(max_retries=0)\n",
|
||||
"anthropic_llm = ChatAnthropic()\n",
|
||||
"llm = openai_llm.with_fallbacks([anthropic_llm])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"id": "584461ab",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Hit error\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Let's use just the OpenAI LLm first, to show that we run into an error\n",
|
||||
"with patch('openai.ChatCompletion.create', side_effect=RateLimitError()):\n",
|
||||
" try:\n",
|
||||
" print(openai_llm.invoke(\"Why did the the chicken cross the road?\"))\n",
|
||||
" except:\n",
|
||||
" print(\"Hit error\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"id": "4fc1e673",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"content=' I don\\'t actually know why the chicken crossed the road, but here are some possible humorous answers:\\n\\n- To get to the other side!\\n\\n- It was too chicken to just stand there. \\n\\n- It wanted a change of scenery.\\n\\n- It wanted to show the possum it could be done.\\n\\n- It was on its way to a poultry farmers\\' convention.\\n\\nThe joke plays on the double meaning of \"the other side\" - literally crossing the road to the other side, or the \"other side\" meaning the afterlife. So it\\'s an anti-joke, with a silly or unexpected pun as the answer.' additional_kwargs={} example=False\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Now let's try with fallbacks to Anthropic\n",
|
||||
"with patch('openai.ChatCompletion.create', side_effect=RateLimitError()):\n",
|
||||
" try:\n",
|
||||
" print(llm.invoke(\"Why did the the chicken cross the road?\"))\n",
|
||||
" except:\n",
|
||||
" print(\"Hit error\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f00bea25",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We can use our \"LLM with Fallbacks\" as we would a normal LLM."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"id": "4f8eaaa0",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"content=\" I don't actually know why the kangaroo crossed the road, but I can take a guess! Here are some possible reasons:\\n\\n- To get to the other side (the classic joke answer!)\\n\\n- It was trying to find some food or water \\n\\n- It was trying to find a mate during mating season\\n\\n- It was fleeing from a predator or perceived threat\\n\\n- It was disoriented and crossed accidentally \\n\\n- It was following a herd of other kangaroos who were crossing\\n\\n- It wanted a change of scenery or environment \\n\\n- It was trying to reach a new habitat or territory\\n\\nThe real reason is unknown without more context, but hopefully one of those potential explanations does the joke justice! Let me know if you have any other animal jokes I can try to decipher.\" additional_kwargs={} example=False\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.prompts import ChatPromptTemplate\n",
|
||||
"\n",
|
||||
"prompt = ChatPromptTemplate.from_messages(\n",
|
||||
" [\n",
|
||||
" (\"system\", \"You're a nice assistant who always includes a compliment in your response\"),\n",
|
||||
" (\"human\", \"Why did the {animal} cross the road\"),\n",
|
||||
" ]\n",
|
||||
")\n",
|
||||
"chain = prompt | llm\n",
|
||||
"with patch('openai.ChatCompletion.create', side_effect=RateLimitError()):\n",
|
||||
" try:\n",
|
||||
" print(chain.invoke({\"animal\": \"kangaroo\"}))\n",
|
||||
" except:\n",
|
||||
" print(\"Hit error\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8d62241b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Fallbacks for Sequences\n",
|
||||
"\n",
|
||||
"We can also create fallbacks for sequences, that are sequences themselves. Here we do that with two different models: ChatOpenAI and then normal OpenAI (which does not use a chat model). Because OpenAI is NOT a chat model, you likely want a different prompt."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 30,
|
||||
"id": "6d0b8056",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# First let's create a chain with a ChatModel\n",
|
||||
"# We add in a string output parser here so the outputs between the two are the same type\n",
|
||||
"from langchain.schema.output_parser import StrOutputParser\n",
|
||||
"\n",
|
||||
"chat_prompt = ChatPromptTemplate.from_messages(\n",
|
||||
" [\n",
|
||||
" (\"system\", \"You're a nice assistant who always includes a compliment in your response\"),\n",
|
||||
" (\"human\", \"Why did the {animal} cross the road\"),\n",
|
||||
" ]\n",
|
||||
")\n",
|
||||
"# Here we're going to use a bad model name to easily create a chain that will error\n",
|
||||
"chat_model = ChatOpenAI(model_name=\"gpt-fake\")\n",
|
||||
"bad_chain = chat_prompt | chat_model | StrOutputParser()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 31,
|
||||
"id": "8d1fc2a5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Now lets create a chain with the normal OpenAI model\n",
|
||||
"from langchain.llms import OpenAI\n",
|
||||
"from langchain.prompts import PromptTemplate\n",
|
||||
"\n",
|
||||
"prompt_template = \"\"\"Instructions: You should always include a compliment in your response.\n",
|
||||
"\n",
|
||||
"Question: Why did the {animal} cross the road?\"\"\"\n",
|
||||
"prompt = PromptTemplate.from_template(prompt_template)\n",
|
||||
"llm = OpenAI()\n",
|
||||
"good_chain = prompt | llm"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 32,
|
||||
"id": "283bfa44",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'\\n\\nAnswer: The turtle crossed the road to get to the other side, and I have to say he had some impressive determination.'"
|
||||
]
|
||||
},
|
||||
"execution_count": 32,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# We can now create a final chain which combines the two\n",
|
||||
"chain = bad_chain.with_fallbacks([good_chain])\n",
|
||||
"chain.invoke({\"animal\": \"turtle\"})"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ec4685b4",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Handling Long Inputs\n",
|
||||
"\n",
|
||||
"One of the big limiting factors of LLMs in their context window. Usually you can count and track the length of prompts before sending them to an LLM, but in situations where that is hard/complicated you can fallback to a model with longer context length."
|
||||
]
|
||||
},
|
||||
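{
"cell_type": "markdown",
"id": "2d9b4c61",
"metadata": {},
"source": [
"For the \"count and track\" approach mentioned above, something like `tiktoken` can measure a prompt before sending it. A rough sketch (the 4,000-token threshold is an assumption that leaves headroom under the 4097-token limit reported in the error below):\n",
"\n",
"```python\n",
"import tiktoken\n",
"\n",
"enc = tiktoken.encoding_for_model(\"gpt-3.5-turbo\")\n",
"n_tokens = len(enc.encode(prompt_text))  # prompt_text: the string you are about to send\n",
"llm_to_use = long_llm if n_tokens > 4000 else short_llm\n",
"```"
]
},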
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 34,
|
||||
"id": "564b84c9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"short_llm = ChatOpenAI()\n",
|
||||
"long_llm = ChatOpenAI(model=\"gpt-3.5-turbo-16k\")\n",
|
||||
"llm = short_llm.with_fallbacks([long_llm])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 38,
|
||||
"id": "5e27a775",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"inputs = \"What is the next number: \" + \", \".join([\"one\", \"two\"] * 3000)"
|
||||
]
|
||||
},
|
||||
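{
"cell_type": "markdown",
"id": "8f1e6a05",
"metadata": {},
"source": [
"This repeated \"one, two\" string comes out to roughly 12,000 tokens (the error below reports 12012), far past the default model's 4097-token window."
]
},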
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 40,
|
||||
"id": "0a502731",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"This model's maximum context length is 4097 tokens. However, your messages resulted in 12012 tokens. Please reduce the length of the messages.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"try:\n",
|
||||
" print(short_llm.invoke(inputs))\n",
|
||||
"except Exception as e:\n",
|
||||
" print(e)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 41,
|
||||
"id": "d91ba5d7",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"content='The next number in the sequence is two.' additional_kwargs={} example=False\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"try:\n",
|
||||
" print(llm.invoke(inputs))\n",
|
||||
"except Exception as e:\n",
|
||||
" print(e)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2a6735df",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Fallback to Better Model\n",
|
||||
"\n",
|
||||
"Often times we ask models to output format in a specific format (like JSON). Models like GPT-3.5 can do this okay, but sometimes struggle. This naturally points to fallbacks - we can try with GPT-3.5 (faster, cheaper), but then if parsing fails we can use GPT-4."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 42,
|
||||
"id": "867a3793",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.output_parsers import DatetimeOutputParser"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 67,
|
||||
"id": "b8d9959d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"prompt = ChatPromptTemplate.from_template(\n",
|
||||
" \"what time was {event} (in %Y-%m-%dT%H:%M:%S.%fZ format - only return this value)\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 75,
|
||||
"id": "98087a76",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# In this case we are going to do the fallbacks on the LLM + output parser level\n",
|
||||
"# Because the error will get raised in the OutputParser\n",
|
||||
"openai_35 = ChatOpenAI() | DatetimeOutputParser()\n",
|
||||
"openai_4 = ChatOpenAI(model=\"gpt-4\")| DatetimeOutputParser()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 77,
|
||||
"id": "17ec9e8f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"only_35 = prompt | openai_35 \n",
|
||||
"fallback_4 = prompt | openai_35.with_fallbacks([openai_4])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 80,
|
||||
"id": "7e536f0b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Error: Could not parse datetime string: The Super Bowl in 1994 took place on January 30th at 3:30 PM local time. Converting this to the specified format (%Y-%m-%dT%H:%M:%S.%fZ) results in: 1994-01-30T15:30:00.000Z\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"try:\n",
|
||||
" print(only_35.invoke({\"event\": \"the superbowl in 1994\"}))\n",
|
||||
"except Exception as e:\n",
|
||||
" print(f\"Error: {e}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 81,
|
||||
"id": "01355c5e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"1994-01-30 15:30:00\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"try:\n",
|
||||
" print(fallback_4.invoke({\"event\": \"the superbowl in 1994\"}))\n",
|
||||
"except Exception as e:\n",
|
||||
" print(f\"Error: {e}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c537f9d0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -1,807 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b8982428",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Private, local, open source LLMs\n",
|
||||
"\n",
|
||||
"## Use case\n",
|
||||
"\n",
|
||||
"The popularity of projects like [PrivateGPT](https://github.com/imartinez/privateGPT), [llama.cpp](https://github.com/ggerganov/llama.cpp), and [GPT4All](https://github.com/nomic-ai/gpt4all) underscore the demand to run LLMs locally (on your own device).\n",
|
||||
"\n",
|
||||
"This has at least two important benefits:\n",
|
||||
"\n",
|
||||
"1. `Privacy`: Your data is not sent to a third party, and it is not subject to the terms of service of a commercial service\n",
|
||||
"2. `Cost`: There is no inference fee, which is important for token-intensive applications (e.g., [long-running simulations](https://twitter.com/RLanceMartin/status/1691097659262820352?s=20), summarization)\n",
|
||||
"\n",
|
||||
"## Overview\n",
|
||||
"\n",
|
||||
"Running an LLM locally requires a few things:\n",
|
||||
"\n",
|
||||
"1. `Open source LLM`: An open source LLM that can be freely modified and shared \n",
|
||||
"2. `Inference`: Ability to run this LLM on your device w/ acceptable latency\n",
|
||||
"\n",
|
||||
"### Open Source LLMs\n",
|
||||
"\n",
|
||||
"Users can now gain access to a rapidly growing set of [open source LLMs](https://cameronrwolfe.substack.com/p/the-history-of-open-source-llms-better). \n",
|
||||
"\n",
|
||||
"These LLMs can be assessed across at least two dimentions (see figure):\n",
|
||||
" \n",
|
||||
"1. `Base model`: What is the base-model and how was it trained?\n",
|
||||
"2. `Fine-tuning approach`: Was the base-model fine-tuned and, if so, what [set of instructions](https://cameronrwolfe.substack.com/p/beyond-llama-the-power-of-open-llms#%C2%A7alpaca-an-instruction-following-llama-model) was used?\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"The relative performance of these models can be assessed using several leaderboards, including:\n",
|
||||
"\n",
|
||||
"1. [LmSys](https://chat.lmsys.org/?arena)\n",
|
||||
"2. [GPT4All](https://gpt4all.io/index.html)\n",
|
||||
"3. [HuggingFace](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard)\n",
|
||||
"\n",
|
||||
"### Inference\n",
|
||||
"\n",
|
||||
"A few frameworks for this have emerged to support inference of open source LLMs on various devices:\n",
|
||||
"\n",
|
||||
"1. [`llama.cpp`](https://github.com/ggerganov/llama.cpp): C++ implementation of llama inference code with [weight optimization / quantization](https://finbarr.ca/how-is-llama-cpp-possible/)\n",
|
||||
"2. [`gpt4all`](https://docs.gpt4all.io/index.html): Optimized C backend for inference\n",
|
||||
"3. [`Ollama`](https://ollama.ai/): Bundles model weights and environment into an app that runs on device and serves the LLM \n",
|
||||
"\n",
|
||||
"In general, these frameworks will do a few things:\n",
|
||||
"\n",
|
||||
"1. `Quantization`: Reduce the memory footprint of the raw model weights\n",
|
||||
"2. `Efficient implementation for inference`: Support inference on consumer hardware (e.g., CPU or laptop GPU)\n",
|
||||
"\n",
|
||||
"In particular, see [this excellent post](https://finbarr.ca/how-is-llama-cpp-possible/) on the importance of quantization.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"With less precision, we radically decrease the memory needed to store the LLM in memory.\n",
|
||||
"\n",
|
||||
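"As rough arithmetic: a 7B-parameter model takes ~14 GB at 16-bit precision (7B parameters × 2 bytes) but only ~3.5 GB at 4-bit quantization, ignoring runtime overhead.\n",
"\n",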
"In addition, we can see the importance of GPU memory bandwidth [sheet](https://docs.google.com/spreadsheets/d/1OehfHHNSn66BP2h3Bxp2NJTVX97icU0GmCXF6pK23H8/edit#gid=0)!\n",
|
||||
"\n",
|
||||
"A Mac M2 Max is 5-6x faster than a M1 for inference due to the larger GPU memory bandwidth.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"## Quickstart\n",
|
||||
"\n",
|
||||
"[`Ollama`](https://ollama.ai/) is one way to easily run inference on macOS.\n",
|
||||
" \n",
|
||||
"The instructions [here](docs/integrations/llms/ollama) provide details, which we summarize:\n",
|
||||
" \n",
|
||||
"* [Download and run](https://ollama.ai/download) the app\n",
|
||||
"* From command line, fetch a model from this [list of options](https://github.com/jmorganca/ollama): e.g., `ollama pull llama2`\n",
|
||||
"* When the app is running, all models are automatically served on `localhost:11434`\n"
|
||||
]
|
||||
},
|
||||
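{
"cell_type": "markdown",
"id": "6d3a8b24",
"metadata": {},
"source": [
"Before wiring Ollama into LangChain, you can sanity-check that the app is serving. A minimal sketch (the \"Ollama is running\" response text is an assumption about the server's root endpoint):\n",
"\n",
"```python\n",
"import requests\n",
"\n",
"# The app serves all models on localhost:11434 (see the bullet above).\n",
"print(requests.get(\"http://localhost:11434\").text)  # expected: 'Ollama is running'\n",
"```"
]
},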
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "86178adb",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"' The first man on the moon was Neil Armstrong, who landed on the moon on July 20, 1969 as part of the Apollo 11 mission. obviously.'"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.llms import Ollama\n",
|
||||
"llm = Ollama(model=\"llama2\")\n",
|
||||
"llm(\"The first man on the moon was ...\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "343ab645",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Stream tokens as they are being generated."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 40,
|
||||
"id": "9cd83603",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" The first man to walk on the moon was Neil Armstrong, an American astronaut who was part of the Apollo 11 mission in 1969. февруари 20, 1969, Armstrong stepped out of the lunar module Eagle and onto the moon's surface, famously declaring \"That's one small step for man, one giant leap for mankind\" as he took his first steps. He was followed by fellow astronaut Edwin \"Buzz\" Aldrin, who also walked on the moon during the mission."
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"' The first man to walk on the moon was Neil Armstrong, an American astronaut who was part of the Apollo 11 mission in 1969. февруари 20, 1969, Armstrong stepped out of the lunar module Eagle and onto the moon\\'s surface, famously declaring \"That\\'s one small step for man, one giant leap for mankind\" as he took his first steps. He was followed by fellow astronaut Edwin \"Buzz\" Aldrin, who also walked on the moon during the mission.'"
|
||||
]
|
||||
},
|
||||
"execution_count": 40,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.callbacks.manager import CallbackManager\n",
|
||||
"from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler \n",
|
||||
"llm = Ollama(model=\"llama2\", \n",
|
||||
" callback_manager = CallbackManager([StreamingStdOutCallbackHandler()]))\n",
|
||||
"llm(\"The first man on the moon was ...\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "5cb27414",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Environment\n",
|
||||
"\n",
|
||||
"Inference speed is a chllenge when running models locally (see above).\n",
|
||||
"\n",
|
||||
"To minimize latency, it is desiable to run models locally on GPU, which ships with many consumer laptops [e.g., Apple devices](https://www.apple.com/newsroom/2022/06/apple-unveils-m2-with-breakthrough-performance-and-capabilities/).\n",
|
||||
"\n",
|
||||
"And even with GPU, the available GPU memory bandwidth (as noted above) is important.\n",
|
||||
"\n",
|
||||
"### Running Apple silicon GPU\n",
|
||||
"\n",
|
||||
"`Ollama` will automatically utilize the GPU on Apple devices.\n",
|
||||
" \n",
|
||||
"Other frameworks require the user to set up the environment to utilize the Apple GPU.\n",
|
||||
"\n",
|
||||
"For example, `llama.cpp` python bindings can be configured to use the GPU via [Metal](https://developer.apple.com/metal/).\n",
|
||||
"\n",
|
||||
"Metal is a graphics and compute API created by Apple providing near-direct access to the GPU. \n",
|
||||
"\n",
|
||||
"See the [`llama.cpp`](docs/integrations/llms/llamacpp) setup [here](https://github.com/abetlen/llama-cpp-python/blob/main/docs/install/macos.md) to enable this.\n",
|
||||
"\n",
|
||||
"In particular, ensure that conda is using the correct virtual enviorment that you created (`miniforge3`).\n",
|
||||
"\n",
|
||||
"E.g., for me:\n",
|
||||
"\n",
|
||||
"```\n",
|
||||
"conda activate /Users/rlm/miniforge3/envs/llama\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"With the above confirmed, then:\n",
|
||||
"\n",
|
||||
"```\n",
|
||||
"CMAKE_ARGS=\"-DLLAMA_METAL=on\" FORCE_CMAKE=1 pip install -U llama-cpp-python --no-cache-dir\n",
|
||||
"```"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c382e79a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## LLMs\n",
|
||||
"\n",
|
||||
"There are various ways to gain access to quantized model weights.\n",
|
||||
"\n",
|
||||
"1. [`HuggingFace`](https://huggingface.co/TheBloke) - Many quantized model are available for download and can be run with framework such as [`llama.cpp`](https://github.com/ggerganov/llama.cpp)\n",
|
||||
"2. [`gpt4all`](https://gpt4all.io/index.html) - The model explorer offers a leaderboard of metrics and associated quantized models available for download \n",
|
||||
"3. [`Ollama`](https://github.com/jmorganca/ollama) - Several models can be accessed directly via `pull`\n",
|
||||
"\n",
|
||||
"### Ollama\n",
|
||||
"\n",
|
||||
"With [Ollama](docs/integrations/llms/ollama), fetch a model via `ollama pull <model family>:<tag>`:\n",
|
||||
"\n",
|
||||
"* E.g., for Llama-7b: `ollama pull llama2` will download the most basic version of the model (e.g., smallest # parameters and 4 bit quantization)\n",
|
||||
"* We can also specify a particular version from the [model list](https://github.com/jmorganca/ollama), e.g., `ollama pull llama2:13b`\n",
|
||||
"* See the full set of parameters on the [API reference page](https://api.python.langchain.com/en/latest/llms/langchain.llms.ollama.Ollama.html)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 42,
|
||||
"id": "8ecd2f78",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"' Sure! Here\\'s the answer, broken down step by step:\\n\\nThe first man on the moon was... Neil Armstrong.\\n\\nHere\\'s how I arrived at that answer:\\n\\n1. The first manned mission to land on the moon was Apollo 11.\\n2. The mission included three astronauts: Neil Armstrong, Edwin \"Buzz\" Aldrin, and Michael Collins.\\n3. Neil Armstrong was the mission commander and the first person to set foot on the moon.\\n4. On July 20, 1969, Armstrong stepped out of the lunar module Eagle and onto the moon\\'s surface, famously declaring \"That\\'s one small step for man, one giant leap for mankind.\"\\n\\nSo, the first man on the moon was Neil Armstrong!'"
|
||||
]
|
||||
},
|
||||
"execution_count": 42,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.llms import Ollama\n",
|
||||
"llm = Ollama(model=\"llama2:13b\")\n",
|
||||
"llm(\"The first man on the moon was ... think step by step\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "07c8c0d1",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Llama.cpp\n",
|
||||
"\n",
|
||||
"Llama.cpp is compatible with a [broad set of models](https://github.com/ggerganov/llama.cpp).\n",
|
||||
"\n",
|
||||
"For example, below we run inference on `llama2-13b` with 4 bit quantization downloaded from [HuggingFace](https://huggingface.co/TheBloke/Llama-2-13B-GGML/tree/main).\n",
|
||||
"\n",
|
||||
"As noted above, see the [API reference](https://api.python.langchain.com/en/latest/llms/langchain.llms.llamacpp.LlamaCpp.html?highlight=llamacpp#langchain.llms.llamacpp.LlamaCpp) for the full set of parameters. \n",
|
||||
"\n",
|
||||
"From the [llama.cpp docs](https://python.langchain.com/docs/integrations/llms/llamacpp), a few are worth commenting on:\n",
|
||||
"\n",
|
||||
"`n_gpu_layers`: number of layers to be loaded into GPU memory\n",
|
||||
"\n",
|
||||
"* Value: 1\n",
|
||||
"* Meaning: Only one layer of the model will be loaded into GPU memory (1 is often sufficient).\n",
|
||||
"\n",
|
||||
"`n_batch`: number of tokens the model should process in parallel \n",
|
||||
"* Value: n_batch\n",
|
||||
"* Meaning: It's recommended to choose a value between 1 and n_ctx (which in this case is set to 2048)\n",
|
||||
"\n",
|
||||
"`n_ctx`: Token context window .\n",
|
||||
"* Value: 2048\n",
|
||||
"* Meaning: The model will consider a window of 2048 tokens at a time\n",
|
||||
"\n",
|
||||
"`f16_kv`: whether the model should use half-precision for the key/value cache\n",
|
||||
"* Value: True\n",
|
||||
"* Meaning: The model will use half-precision, which can be more memory efficient; Metal only support True."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5eba38dc",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pip install llama-cpp-python"
|
||||
]
|
||||
},
|
||||
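{
"cell_type": "markdown",
"id": "4e7b5c83",
"metadata": {},
"source": [
"(If you want the Metal-enabled build described in the Environment section above, install with `CMAKE_ARGS=\"-DLLAMA_METAL=on\" FORCE_CMAKE=1 pip install -U llama-cpp-python --no-cache-dir` instead of the plain install.)"
]
},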
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 43,
|
||||
"id": "9d5f94b5",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"objc[10142]: Class GGMLMetalClass is implemented in both /Users/rlm/miniforge3/envs/llama/lib/python3.9/site-packages/gpt4all/llmodel_DO_NOT_MODIFY/build/libreplit-mainline-metal.dylib (0x2a0c4c208) and /Users/rlm/miniforge3/envs/llama/lib/python3.9/site-packages/llama_cpp/libllama.dylib (0x2c28bc208). One of the two will be used. Which one is undefined.\n",
|
||||
"llama.cpp: loading model from /Users/rlm/Desktop/Code/llama.cpp/llama-2-13b-chat.ggmlv3.q4_0.bin\n",
|
||||
"llama_model_load_internal: format = ggjt v3 (latest)\n",
|
||||
"llama_model_load_internal: n_vocab = 32000\n",
|
||||
"llama_model_load_internal: n_ctx = 2048\n",
|
||||
"llama_model_load_internal: n_embd = 5120\n",
|
||||
"llama_model_load_internal: n_mult = 256\n",
|
||||
"llama_model_load_internal: n_head = 40\n",
|
||||
"llama_model_load_internal: n_layer = 40\n",
|
||||
"llama_model_load_internal: n_rot = 128\n",
|
||||
"llama_model_load_internal: freq_base = 10000.0\n",
|
||||
"llama_model_load_internal: freq_scale = 1\n",
|
||||
"llama_model_load_internal: ftype = 2 (mostly Q4_0)\n",
|
||||
"llama_model_load_internal: n_ff = 13824\n",
|
||||
"llama_model_load_internal: model size = 13B\n",
|
||||
"llama_model_load_internal: ggml ctx size = 0.09 MB\n",
|
||||
"llama_model_load_internal: mem required = 8953.71 MB (+ 1608.00 MB per state)\n",
|
||||
"llama_new_context_with_model: kv self size = 1600.00 MB\n",
|
||||
"ggml_metal_init: allocating\n",
|
||||
"ggml_metal_init: using MPS\n",
|
||||
"ggml_metal_init: loading '/Users/rlm/miniforge3/envs/llama/lib/python3.9/site-packages/llama_cpp/ggml-metal.metal'\n",
|
||||
"ggml_metal_init: loaded kernel_add 0x47774af60\n",
|
||||
"ggml_metal_init: loaded kernel_mul 0x47774bc00\n",
|
||||
"ggml_metal_init: loaded kernel_mul_row 0x47774c230\n",
|
||||
"ggml_metal_init: loaded kernel_scale 0x47774c890\n",
|
||||
"ggml_metal_init: loaded kernel_silu 0x47774cef0\n",
|
||||
"ggml_metal_init: loaded kernel_relu 0x10e33e500\n",
|
||||
"ggml_metal_init: loaded kernel_gelu 0x47774b2f0\n",
|
||||
"ggml_metal_init: loaded kernel_soft_max 0x47771a580\n",
|
||||
"ggml_metal_init: loaded kernel_diag_mask_inf 0x47774dab0\n",
|
||||
"ggml_metal_init: loaded kernel_get_rows_f16 0x47774e110\n",
|
||||
"ggml_metal_init: loaded kernel_get_rows_q4_0 0x47774e7d0\n",
|
||||
"ggml_metal_init: loaded kernel_get_rows_q4_1 0x13efd7170\n",
|
||||
"ggml_metal_init: loaded kernel_get_rows_q2_K 0x13efd73d0\n",
|
||||
"ggml_metal_init: loaded kernel_get_rows_q3_K 0x13efd7630\n",
|
||||
"ggml_metal_init: loaded kernel_get_rows_q4_K 0x13efd7890\n",
|
||||
"ggml_metal_init: loaded kernel_get_rows_q5_K 0x4744c9740\n",
|
||||
"ggml_metal_init: loaded kernel_get_rows_q6_K 0x4744ca6b0\n",
|
||||
"ggml_metal_init: loaded kernel_rms_norm 0x4744cb250\n",
|
||||
"ggml_metal_init: loaded kernel_norm 0x4744cb970\n",
|
||||
"ggml_metal_init: loaded kernel_mul_mat_f16_f32 0x10e33f700\n",
|
||||
"ggml_metal_init: loaded kernel_mul_mat_q4_0_f32 0x10e33fcd0\n",
|
||||
"ggml_metal_init: loaded kernel_mul_mat_q4_1_f32 0x4744cc2d0\n",
|
||||
"ggml_metal_init: loaded kernel_mul_mat_q2_K_f32 0x4744cc6f0\n",
|
||||
"ggml_metal_init: loaded kernel_mul_mat_q3_K_f32 0x4744cd6b0\n",
|
||||
"ggml_metal_init: loaded kernel_mul_mat_q4_K_f32 0x4744cde20\n",
|
||||
"ggml_metal_init: loaded kernel_mul_mat_q5_K_f32 0x10e33ff30\n",
|
||||
"ggml_metal_init: loaded kernel_mul_mat_q6_K_f32 0x10e340190\n",
|
||||
"ggml_metal_init: loaded kernel_rope 0x10e3403f0\n",
|
||||
"ggml_metal_init: loaded kernel_alibi_f32 0x10e340de0\n",
|
||||
"ggml_metal_init: loaded kernel_cpy_f32_f16 0x10e3416d0\n",
|
||||
"ggml_metal_init: loaded kernel_cpy_f32_f32 0x10e342080\n",
|
||||
"ggml_metal_init: loaded kernel_cpy_f16_f16 0x10e342ca0\n",
|
||||
"ggml_metal_init: recommendedMaxWorkingSetSize = 21845.34 MB\n",
|
||||
"ggml_metal_init: hasUnifiedMemory = true\n",
|
||||
"ggml_metal_init: maxTransferRate = built-in GPU\n",
|
||||
"ggml_metal_add_buffer: allocated 'data ' buffer, size = 6984.06 MB, ( 6986.19 / 21845.34)\n",
|
||||
"ggml_metal_add_buffer: allocated 'eval ' buffer, size = 1032.00 MB, ( 8018.19 / 21845.34)\n",
|
||||
"ggml_metal_add_buffer: allocated 'kv ' buffer, size = 1602.00 MB, ( 9620.19 / 21845.34)\n",
|
||||
"ggml_metal_add_buffer: allocated 'scr0 ' buffer, size = 426.00 MB, (10046.19 / 21845.34)\n",
|
||||
"ggml_metal_add_buffer: allocated 'scr1 ' buffer, size = 512.00 MB, (10558.19 / 21845.34)\n",
|
||||
"AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 | \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.llms import LlamaCpp\n",
|
||||
"llm = LlamaCpp(\n",
|
||||
" model_path=\"/Users/rlm/Desktop/Code/llama.cpp/llama-2-13b-chat.ggmlv3.q4_0.bin\",\n",
|
||||
" n_gpu_layers=1,\n",
|
||||
" n_batch=512,\n",
|
||||
" n_ctx=2048,\n",
|
||||
" f16_kv=True, \n",
|
||||
" callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),\n",
|
||||
" verbose=True,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f56f5168",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The console log will show the the below to indicate Metal was enabled properly from steps above:\n",
|
||||
"```\n",
|
||||
"ggml_metal_init: allocating\n",
|
||||
"ggml_metal_init: using MPS\n",
|
||||
"```"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 45,
|
||||
"id": "7890a077",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Llama.generate: prefix-match hit\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" and use logical reasoning to figure out who the first man on the moon was.\n",
|
||||
"\n",
|
||||
"Here are some clues:\n",
|
||||
"\n",
|
||||
"1. The first man on the moon was an American.\n",
|
||||
"2. He was part of the Apollo 11 mission.\n",
|
||||
"3. He stepped out of the lunar module and became the first person to set foot on the moon's surface.\n",
|
||||
"4. His last name is Armstrong.\n",
|
||||
"\n",
|
||||
"Now, let's use our reasoning skills to figure out who the first man on the moon was. Based on clue #1, we know that the first man on the moon was an American. Clue #2 tells us that he was part of the Apollo 11 mission. Clue #3 reveals that he was the first person to set foot on the moon's surface. And finally, clue #4 gives us his last name: Armstrong.\n",
|
||||
"Therefore, the first man on the moon was Neil Armstrong!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"llama_print_timings: load time = 9623.21 ms\n",
|
||||
"llama_print_timings: sample time = 143.77 ms / 203 runs ( 0.71 ms per token, 1412.01 tokens per second)\n",
|
||||
"llama_print_timings: prompt eval time = 485.94 ms / 7 tokens ( 69.42 ms per token, 14.40 tokens per second)\n",
|
||||
"llama_print_timings: eval time = 6385.16 ms / 202 runs ( 31.61 ms per token, 31.64 tokens per second)\n",
|
||||
"llama_print_timings: total time = 7279.28 ms\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"\" and use logical reasoning to figure out who the first man on the moon was.\\n\\nHere are some clues:\\n\\n1. The first man on the moon was an American.\\n2. He was part of the Apollo 11 mission.\\n3. He stepped out of the lunar module and became the first person to set foot on the moon's surface.\\n4. His last name is Armstrong.\\n\\nNow, let's use our reasoning skills to figure out who the first man on the moon was. Based on clue #1, we know that the first man on the moon was an American. Clue #2 tells us that he was part of the Apollo 11 mission. Clue #3 reveals that he was the first person to set foot on the moon's surface. And finally, clue #4 gives us his last name: Armstrong.\\nTherefore, the first man on the moon was Neil Armstrong!\""
|
||||
]
|
||||
},
|
||||
"execution_count": 45,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"llm(\"The first man on the moon was ... Let's think step by step\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "831ddf7c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### GPT4All\n",
|
||||
"\n",
|
||||
"We can use model weights downloaded from [GPT4All](https://python.langchain.com/docs/integrations/llms/gpt4all) model explorer.\n",
|
||||
"\n",
|
||||
"Similar to what is shown above, we can run inference and use [the API reference](https://api.python.langchain.com/en/latest/llms/langchain.llms.gpt4all.GPT4All.html?highlight=gpt4all#langchain.llms.gpt4all.GPT4All) to set parameters of interest."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e27baf6e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pip install gpt4all"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 46,
|
||||
"id": "b55a2147",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Found model file at /Users/rlm/Desktop/Code/gpt4all/models/nous-hermes-13b.ggmlv3.q4_0.bin\n",
|
||||
"llama_new_context_with_model: max tensor size = 87.89 MB\n",
|
||||
"llama_new_context_with_model: max tensor size = 87.89 MB\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"llama.cpp: using Metal\n",
|
||||
"llama.cpp: loading model from /Users/rlm/Desktop/Code/gpt4all/models/nous-hermes-13b.ggmlv3.q4_0.bin\n",
|
||||
"llama_model_load_internal: format = ggjt v3 (latest)\n",
|
||||
"llama_model_load_internal: n_vocab = 32001\n",
|
||||
"llama_model_load_internal: n_ctx = 2048\n",
|
||||
"llama_model_load_internal: n_embd = 5120\n",
|
||||
"llama_model_load_internal: n_mult = 256\n",
|
||||
"llama_model_load_internal: n_head = 40\n",
|
||||
"llama_model_load_internal: n_layer = 40\n",
|
||||
"llama_model_load_internal: n_rot = 128\n",
|
||||
"llama_model_load_internal: ftype = 2 (mostly Q4_0)\n",
|
||||
"llama_model_load_internal: n_ff = 13824\n",
|
||||
"llama_model_load_internal: n_parts = 1\n",
|
||||
"llama_model_load_internal: model size = 13B\n",
|
||||
"llama_model_load_internal: ggml ctx size = 0.09 MB\n",
|
||||
"llama_model_load_internal: mem required = 9031.71 MB (+ 1608.00 MB per state)\n",
|
||||
"llama_new_context_with_model: kv self size = 1600.00 MB\n",
|
||||
"ggml_metal_init: allocating\n",
|
||||
"ggml_metal_init: using MPS\n",
|
||||
"ggml_metal_init: loading '/Users/rlm/miniforge3/envs/llama/lib/python3.9/site-packages/gpt4all/llmodel_DO_NOT_MODIFY/build/ggml-metal.metal'\n",
|
||||
"ggml_metal_init: loaded kernel_add 0x37944d850\n",
|
||||
"ggml_metal_init: loaded kernel_mul 0x37944f350\n",
|
||||
"ggml_metal_init: loaded kernel_mul_row 0x37944fdd0\n",
|
||||
"ggml_metal_init: loaded kernel_scale 0x3794505a0\n",
|
||||
"ggml_metal_init: loaded kernel_silu 0x379450800\n",
|
||||
"ggml_metal_init: loaded kernel_relu 0x379450a60\n",
|
||||
"ggml_metal_init: loaded kernel_gelu 0x379450cc0\n",
|
||||
"ggml_metal_init: loaded kernel_soft_max 0x379450ff0\n",
|
||||
"ggml_metal_init: loaded kernel_diag_mask_inf 0x379451250\n",
|
||||
"ggml_metal_init: loaded kernel_get_rows_f16 0x3794514b0\n",
|
||||
"ggml_metal_init: loaded kernel_get_rows_q4_0 0x379451710\n",
|
||||
"ggml_metal_init: loaded kernel_get_rows_q4_1 0x379451970\n",
|
||||
"ggml_metal_init: loaded kernel_get_rows_q2_k 0x379451bd0\n",
|
||||
"ggml_metal_init: loaded kernel_get_rows_q3_k 0x379451e30\n",
|
||||
"ggml_metal_init: loaded kernel_get_rows_q4_k 0x379452090\n",
|
||||
"ggml_metal_init: loaded kernel_get_rows_q5_k 0x3794522f0\n",
|
||||
"ggml_metal_init: loaded kernel_get_rows_q6_k 0x379452550\n",
|
||||
"ggml_metal_init: loaded kernel_rms_norm 0x3794527b0\n",
|
||||
"ggml_metal_init: loaded kernel_norm 0x379452a10\n",
|
||||
"ggml_metal_init: loaded kernel_mul_mat_f16_f32 0x379452c70\n",
|
||||
"ggml_metal_init: loaded kernel_mul_mat_q4_0_f32 0x379452ed0\n",
|
||||
"ggml_metal_init: loaded kernel_mul_mat_q4_1_f32 0x379453130\n",
|
||||
"ggml_metal_init: loaded kernel_mul_mat_q2_k_f32 0x379453390\n",
|
||||
"ggml_metal_init: loaded kernel_mul_mat_q3_k_f32 0x3794535f0\n",
|
||||
"ggml_metal_init: loaded kernel_mul_mat_q4_k_f32 0x379453850\n",
|
||||
"ggml_metal_init: loaded kernel_mul_mat_q5_k_f32 0x379453ab0\n",
|
||||
"ggml_metal_init: loaded kernel_mul_mat_q6_k_f32 0x379453d10\n",
|
||||
"ggml_metal_init: loaded kernel_rope 0x379453f70\n",
|
||||
"ggml_metal_init: loaded kernel_alibi_f32 0x3794541d0\n",
|
||||
"ggml_metal_init: loaded kernel_cpy_f32_f16 0x379454430\n",
|
||||
"ggml_metal_init: loaded kernel_cpy_f32_f32 0x379454690\n",
|
||||
"ggml_metal_init: loaded kernel_cpy_f16_f16 0x3794548f0\n",
|
||||
"ggml_metal_init: recommendedMaxWorkingSetSize = 21845.34 MB\n",
|
||||
"ggml_metal_init: hasUnifiedMemory = true\n",
|
||||
"ggml_metal_init: maxTransferRate = built-in GPU\n",
|
||||
"ggml_metal_add_buffer: allocated 'data ' buffer, size = 6984.06 MB, (17542.94 / 21845.34)\n",
|
||||
"ggml_metal_add_buffer: allocated 'eval ' buffer, size = 1024.00 MB, (18566.94 / 21845.34)\n",
|
||||
"ggml_metal_add_buffer: allocated 'kv ' buffer, size = 1602.00 MB, (20168.94 / 21845.34)\n",
|
||||
"ggml_metal_add_buffer: allocated 'scr0 ' buffer, size = 512.00 MB, (20680.94 / 21845.34)\n",
|
||||
"ggml_metal_add_buffer: allocated 'scr1 ' buffer, size = 512.00 MB, (21192.94 / 21845.34)\n",
|
||||
"ggml_metal_free: deallocating\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.llms import GPT4All\n",
|
||||
"llm = GPT4All(model=\"/Users/rlm/Desktop/Code/gpt4all/models/nous-hermes-13b.ggmlv3.q4_0.bin\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 47,
|
||||
"id": "e3d4526f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"\".\\n1) The United States decides to send a manned mission to the moon.2) They choose their best astronauts and train them for this specific mission.3) They build a spacecraft that can take humans to the moon, called the Lunar Module (LM).4) They also create a larger spacecraft, called the Saturn V rocket, which will launch both the LM and the Command Service Module (CSM), which will carry the astronauts into orbit.5) The mission is planned down to the smallest detail: from the trajectory of the rockets to the exact movements of the astronauts during their moon landing.6) On July 16, 1969, the Saturn V rocket launches from Kennedy Space Center in Florida, carrying the Apollo 11 mission crew into space.7) After one and a half orbits around the Earth, the LM separates from the CSM and begins its descent to the moon's surface.8) On July 20, 1969, at 2:56 pm EDT (GMT-4), Neil Armstrong becomes the first man on the moon. He speaks these\""
|
||||
]
|
||||
},
|
||||
"execution_count": 47,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"llm(\"The first man on the moon was ... Let's think step by step\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6b84e543",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Prompts\n",
|
||||
"\n",
|
||||
"Some LLMs will benefit from specific prompts.\n",
|
||||
"\n",
|
||||
"For example, llama2 can use [special tokens](https://twitter.com/RLanceMartin/status/1681879318493003776?s=20).\n",
|
||||
"\n",
|
||||
"We can use `ConditionalPromptSelector` to set prompt based on the model type."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 57,
|
||||
"id": "d082b10a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"llama.cpp: loading model from /Users/rlm/Desktop/Code/llama.cpp/llama-2-13b-chat.ggmlv3.q4_0.bin\n",
|
||||
"llama_model_load_internal: format = ggjt v3 (latest)\n",
|
||||
"llama_model_load_internal: n_vocab = 32000\n",
|
||||
"llama_model_load_internal: n_ctx = 2048\n",
|
||||
"llama_model_load_internal: n_embd = 5120\n",
|
||||
"llama_model_load_internal: n_mult = 256\n",
|
||||
"llama_model_load_internal: n_head = 40\n",
|
||||
"llama_model_load_internal: n_layer = 40\n",
|
||||
"llama_model_load_internal: n_rot = 128\n",
|
||||
"llama_model_load_internal: freq_base = 10000.0\n",
|
||||
"llama_model_load_internal: freq_scale = 1\n",
|
||||
"llama_model_load_internal: ftype = 2 (mostly Q4_0)\n",
|
||||
"llama_model_load_internal: n_ff = 13824\n",
|
||||
"llama_model_load_internal: model size = 13B\n",
|
||||
"llama_model_load_internal: ggml ctx size = 0.09 MB\n",
|
||||
"llama_model_load_internal: mem required = 8953.71 MB (+ 1608.00 MB per state)\n",
|
||||
"llama_new_context_with_model: kv self size = 1600.00 MB\n",
|
||||
"ggml_metal_init: allocating\n",
|
||||
"ggml_metal_init: using MPS\n",
|
||||
"ggml_metal_init: loading '/Users/rlm/miniforge3/envs/llama/lib/python3.9/site-packages/llama_cpp/ggml-metal.metal'\n",
|
||||
"ggml_metal_init: loaded kernel_add 0x4744d09d0\n",
|
||||
"ggml_metal_init: loaded kernel_mul 0x3781cb3d0\n",
|
||||
"ggml_metal_init: loaded kernel_mul_row 0x37813bb60\n",
|
||||
"ggml_metal_init: loaded kernel_scale 0x474481080\n",
|
||||
"ggml_metal_init: loaded kernel_silu 0x4744d29f0\n",
|
||||
"ggml_metal_init: loaded kernel_relu 0x3781254c0\n",
|
||||
"ggml_metal_init: loaded kernel_gelu 0x47447f280\n",
|
||||
"ggml_metal_init: loaded kernel_soft_max 0x4744cf470\n",
|
||||
"ggml_metal_init: loaded kernel_diag_mask_inf 0x4744cf6d0\n",
|
||||
"ggml_metal_init: loaded kernel_get_rows_f16 0x4744cf930\n",
|
||||
"ggml_metal_init: loaded kernel_get_rows_q4_0 0x4744cfb90\n",
|
||||
"ggml_metal_init: loaded kernel_get_rows_q4_1 0x4744cfdf0\n",
|
||||
"ggml_metal_init: loaded kernel_get_rows_q2_K 0x4744d0050\n",
|
||||
"ggml_metal_init: loaded kernel_get_rows_q3_K 0x4744ce980\n",
|
||||
"ggml_metal_init: loaded kernel_get_rows_q4_K 0x4744cebe0\n",
|
||||
"ggml_metal_init: loaded kernel_get_rows_q5_K 0x4744cee40\n",
|
||||
"ggml_metal_init: loaded kernel_get_rows_q6_K 0x4744cf0a0\n",
|
||||
"ggml_metal_init: loaded kernel_rms_norm 0x474482450\n",
|
||||
"ggml_metal_init: loaded kernel_norm 0x4744826b0\n",
|
||||
"ggml_metal_init: loaded kernel_mul_mat_f16_f32 0x474482910\n",
|
||||
"ggml_metal_init: loaded kernel_mul_mat_q4_0_f32 0x474482b70\n",
|
||||
"ggml_metal_init: loaded kernel_mul_mat_q4_1_f32 0x474482dd0\n",
|
||||
"ggml_metal_init: loaded kernel_mul_mat_q2_K_f32 0x474483030\n",
|
||||
"ggml_metal_init: loaded kernel_mul_mat_q3_K_f32 0x474483290\n",
|
||||
"ggml_metal_init: loaded kernel_mul_mat_q4_K_f32 0x4744834f0\n",
|
||||
"ggml_metal_init: loaded kernel_mul_mat_q5_K_f32 0x474483750\n",
|
||||
"ggml_metal_init: loaded kernel_mul_mat_q6_K_f32 0x4744839b0\n",
|
||||
"ggml_metal_init: loaded kernel_rope 0x474483c10\n",
|
||||
"ggml_metal_init: loaded kernel_alibi_f32 0x474483e70\n",
|
||||
"ggml_metal_init: loaded kernel_cpy_f32_f16 0x4744840d0\n",
|
||||
"ggml_metal_init: loaded kernel_cpy_f32_f32 0x474484330\n",
|
||||
"ggml_metal_init: loaded kernel_cpy_f16_f16 0x474484590\n",
|
||||
"ggml_metal_init: recommendedMaxWorkingSetSize = 21845.34 MB\n",
|
||||
"ggml_metal_init: hasUnifiedMemory = true\n",
|
||||
"ggml_metal_init: maxTransferRate = built-in GPU\n",
|
||||
"ggml_metal_add_buffer: allocated 'data ' buffer, size = 6984.06 MB, ( 6986.94 / 21845.34)\n",
|
||||
"ggml_metal_add_buffer: allocated 'eval ' buffer, size = 1032.00 MB, ( 8018.94 / 21845.34)\n",
|
||||
"ggml_metal_add_buffer: allocated 'kv ' buffer, size = 1602.00 MB, ( 9620.94 / 21845.34)\n",
|
||||
"ggml_metal_add_buffer: allocated 'scr0 ' buffer, size = 426.00 MB, (10046.94 / 21845.34)\n",
|
||||
"ggml_metal_add_buffer: allocated 'scr1 ' buffer, size = 512.00 MB, (10558.94 / 21845.34)\n",
|
||||
"AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 | \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Set our LLM\n",
|
||||
"llm = LlamaCpp(\n",
|
||||
" model_path=\"/Users/rlm/Desktop/Code/llama.cpp/llama-2-13b-chat.ggmlv3.q4_0.bin\",\n",
|
||||
" n_gpu_layers=1,\n",
|
||||
" n_batch=512,\n",
|
||||
" n_ctx=2048,\n",
|
||||
" f16_kv=True, \n",
|
||||
" callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),\n",
|
||||
" verbose=True,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "66656084",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Set the associated prompt."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 58,
|
||||
"id": "8555f5bf",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"PromptTemplate(input_variables=['question'], output_parser=None, partial_variables={}, template='<<SYS>> \\n You are an assistant tasked with improving Google search results. \\n <</SYS>> \\n\\n [INST] Generate THREE Google search queries that are similar to this question. The output should be a numbered list of questions and each should have a question mark at the end: \\n\\n {question} [/INST]', template_format='f-string', validate_template=True)"
|
||||
]
|
||||
},
|
||||
"execution_count": 58,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain import PromptTemplate, LLMChain\n",
|
||||
"from langchain.chains.prompt_selector import ConditionalPromptSelector\n",
|
||||
"\n",
|
||||
"DEFAULT_LLAMA_SEARCH_PROMPT = PromptTemplate(\n",
|
||||
" input_variables=[\"question\"],\n",
|
||||
" template=\"\"\"<<SYS>> \\n You are an assistant tasked with improving Google search \\\n",
|
||||
"results. \\n <</SYS>> \\n\\n [INST] Generate THREE Google search queries that \\\n",
|
||||
"are similar to this question. The output should be a numbered list of questions \\\n",
|
||||
"and each should have a question mark at the end: \\n\\n {question} [/INST]\"\"\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"DEFAULT_SEARCH_PROMPT = PromptTemplate(\n",
|
||||
" input_variables=[\"question\"],\n",
|
||||
" template=\"\"\"You are an assistant tasked with improving Google search \\\n",
|
||||
"results. Generate THREE Google search queries that are similar to \\\n",
|
||||
"this question. The output should be a numbered list of questions and each \\\n",
|
||||
"should have a question mark at the end: {question}\"\"\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"QUESTION_PROMPT_SELECTOR = ConditionalPromptSelector(\n",
|
||||
" default_prompt=DEFAULT_SEARCH_PROMPT,\n",
|
||||
" conditionals=[\n",
|
||||
" (lambda llm: isinstance(llm, LlamaCpp), DEFAULT_LLAMA_SEARCH_PROMPT)\n",
|
||||
" ],\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"prompt = QUESTION_PROMPT_SELECTOR.get_prompt(llm)\n",
|
||||
"prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 59,
|
||||
"id": "d0aedfd2",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Sure! Here are three similar search queries with a question mark at the end:\n",
|
||||
"\n",
|
||||
"1. Which NBA team did LeBron James lead to a championship in the year he was drafted?\n",
|
||||
"2. Who won the Grammy Awards for Best New Artist and Best Female Pop Vocal Performance in the same year that Lady Gaga was born?\n",
|
||||
"3. What MLB team did Babe Ruth play for when he hit 60 home runs in a single season?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"llama_print_timings: load time = 14943.19 ms\n",
|
||||
"llama_print_timings: sample time = 72.93 ms / 101 runs ( 0.72 ms per token, 1384.87 tokens per second)\n",
|
||||
"llama_print_timings: prompt eval time = 14942.95 ms / 93 tokens ( 160.68 ms per token, 6.22 tokens per second)\n",
|
||||
"llama_print_timings: eval time = 3430.85 ms / 100 runs ( 34.31 ms per token, 29.15 tokens per second)\n",
|
||||
"llama_print_timings: total time = 18578.26 ms\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"' Sure! Here are three similar search queries with a question mark at the end:\\n\\n1. Which NBA team did LeBron James lead to a championship in the year he was drafted?\\n2. Who won the Grammy Awards for Best New Artist and Best Female Pop Vocal Performance in the same year that Lady Gaga was born?\\n3. What MLB team did Babe Ruth play for when he hit 60 home runs in a single season?'"
|
||||
]
|
||||
},
|
||||
"execution_count": 59,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Chain\n",
|
||||
"llm_chain = LLMChain(prompt=prompt,llm=llm)\n",
|
||||
"question = \"What NFL team won the Super Bowl in the year that Justin Bieber was born?\"\n",
|
||||
"llm_chain.run({\"question\":question})"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6ba66260",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Use cases\n",
|
||||
"\n",
|
||||
"Given an `llm` created from one of the models above, you can use it for [many use cases](docs/use_cases).\n",
|
||||
"\n",
|
||||
"For example, here is a guide to [RAG](docs/use_cases/question_answering/how_to/local_retrieval_qa) with local LLMs.\n",
|
||||
"\n",
|
||||
"In general, use cases for local model can be driven by at least two factors:\n",
|
||||
"\n",
|
||||
"* `Privacy`: private data (e.g., journals, etc) that a user does not want to share \n",
|
||||
"* `Cost`: text preprocessing (extraction/tagging), summarization, and agent simulations are token-use-intensive tasks\n",
|
||||
"\n",
|
||||
"There are a few approach to support specific use-cases: \n",
|
||||
"\n",
|
||||
"* Fine-tuning (e.g., [gpt-llm-trainer](https://github.com/mshumer/gpt-llm-trainer), [Anyscale](https://www.anyscale.com/blog/fine-tuning-llama-2-a-comprehensive-case-study-for-tailoring-models-to-unique-applications)) \n",
|
||||
"* [Function-calling](https://github.com/MeetKai/functionary/tree/main) for use-cases like extraction or tagging\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.16"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -1,107 +0,0 @@
|
||||
# Pydantic Compatibility
|
||||
|
||||
- Pydantic v2 was released in June 2023 (https://docs.pydantic.dev/2.0/blog/pydantic-v2-final/)
|
||||
- v2 contains a number of breaking changes (https://docs.pydantic.dev/2.0/migration/)
|
||||
- Pydantic v2 and v1 are under the same package name, so both versions cannot be installed at the same time
|
||||
|
||||
|
||||
## LangChain Pydantic Migration Plan
|
||||
|
||||
LangChain will carry out the migration to pydantic v2 in two steps:
|
||||
|
||||
1. 2023-08-17: LangChain will allow users to install either Pydantic V1 or V2.
|
||||
* Internally LangChain will continue to [use V1](https://docs.pydantic.dev/latest/migration/#continue-using-pydantic-v1-features).
|
||||
* During this time, users can pin their pydantic version to v1 to avoid breaking changes, or start a partial
|
||||
migration using pydantic v2 throughout their code, while avoiding mixing v1 and v2 code for LangChain (see below).
|
||||
|
||||
2. 2023-08-25: LangChain will migrate internally to using V2 code.
|
||||
* Users will have to upgrade to V2 as well to use LangChain.
|
||||
* Users should stop using the `pydantic.v1` namespace when using LangChain.
|
||||
* See the [bump-pydantic package](https://github.com/pydantic/bump-pydantic) to help with the upgrade process.
|
||||
|
||||
## Between 2023-08-17 and 2023-08-25 releases
|
||||
|
||||
Users can either pin to pydantic v1 and upgrade their code in one go once LangChain has migrated to v2 internally, or they can start a partial migration to v2, but must avoid mixing v1 and v2 code for LangChain.
|
||||
|
||||
Below are two examples showing how to avoid mixing pydantic v1 and v2 code in
|
||||
the case of inheritance and in the case of passing objects to LangChain.
|
||||
|
||||
**Example 1: Extending via inheritance**
|
||||
|
||||
**YES**
|
||||
|
||||
```python
|
||||
from langchain.tools.base import BaseTool  # BaseTool is v1 code
from pydantic.v1 import Field, root_validator, validator
|
||||
|
||||
class CustomTool(BaseTool): # BaseTool is v1 code
|
||||
x: int = Field(default=1)
|
||||
|
||||
def _run(*args, **kwargs):
|
||||
return "hello"
|
||||
|
||||
@validator('x') # v1 code
|
||||
@classmethod
|
||||
def validate_x(cls, x: int) -> int:
|
||||
return 1
|
||||
|
||||
|
||||
CustomTool(
|
||||
name='custom_tool',
|
||||
description="hello",
|
||||
x=1,
|
||||
)
|
||||
```
|
||||
|
||||
Mixing Pydantic v2 primitives with Pydantic v1 primitives can raise cryptic errors.
|
||||
|
||||
**NO**
|
||||
|
||||
```python
|
||||
from langchain.tools.base import BaseTool  # BaseTool is v1 code
from pydantic import Field, field_validator # pydantic v2
|
||||
|
||||
class CustomTool(BaseTool): # BaseTool is v1 code
|
||||
x: int = Field(default=1)
|
||||
|
||||
def _run(*args, **kwargs):
|
||||
return "hello"
|
||||
|
||||
@field_validator('x') # v2 code
|
||||
@classmethod
|
||||
def validate_x(cls, x: int) -> int:
|
||||
return 1
|
||||
|
||||
|
||||
CustomTool(
|
||||
name='custom_tool',
|
||||
description="hello",
|
||||
x=1,
|
||||
)
|
||||
```
|
||||
|
||||
**Example 2: Passing objects to LangChain**
|
||||
|
||||
**YES**
|
||||
|
||||
```python
|
||||
from langchain.tools.base import Tool
|
||||
from pydantic.v1 import BaseModel, Field # <-- Uses v1 namespace
|
||||
|
||||
class CalculatorInput(BaseModel):
|
||||
question: str = Field()
|
||||
|
||||
Tool.from_function( # <-- tool uses v1 namespace
|
||||
func=lambda question: 'hello',
|
||||
name="Calculator",
|
||||
description="useful for when you need to answer questions about math",
|
||||
args_schema=CalculatorInput
|
||||
)
|
||||
```
|
||||
|
||||
**NO**
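
A sketch of the disallowed counterpart, identical to the example above except that the schema is defined with the pydantic v2 namespace:

```python
from langchain.tools.base import Tool
from pydantic import BaseModel, Field # <-- Uses v2 namespace

class CalculatorInput(BaseModel):
    question: str = Field()

Tool.from_function( # <-- tool uses v1 namespace
    func=lambda question: 'hello',
    name="Calculator",
    description="useful for when you need to answer questions about math",
    args_schema=CalculatorInput
)
```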
|
||||
|
||||
|
||||
## After 2023-08-25 release
|
||||
|
||||
* Users must upgrade to v2
|
||||
* Users should not pass `pydantic.v1` derived objects to LangChain or rely on `pydantic.v1` when extending functionality
|
||||
|
||||
@@ -147,7 +147,7 @@
|
||||
" api_key=os.environ[\"ARGILLA_API_KEY\"],\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"dataset.push_to_argilla(\"langchain-dataset\");"
|
||||
"dataset.push_to_argilla(\"langchain-dataset\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -7,12 +7,12 @@
|
||||
"source": [
|
||||
"# Context\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"[Context](https://context.ai/) provides user analytics for LLM powered products and features.\n",
|
||||
"[Context](https://getcontext.ai/) provides product analytics for AI chatbots.\n",
|
||||
"\n",
|
||||
"With Context, you can start understanding your users and improving their experiences in less than 30 minutes.\n",
|
||||
"\n"
|
||||
"Context helps you understand how users are interacting with your AI chat products.\n",
|
||||
"Gain critical insights, optimise poor experiences, and minimise brand risks.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -55,7 +55,7 @@
|
||||
"\n",
|
||||
"To get your Context API token:\n",
|
||||
"\n",
|
||||
"1. Go to the settings page within your Context account (https://with.context.ai/settings).\n",
|
||||
"1. Go to the settings page within your Context account (https://go.getcontext.ai/settings).\n",
|
||||
"2. Generate a new API Token.\n",
|
||||
"3. Store this token somewhere secure."
|
||||
]
|
||||
@@ -207,7 +207,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.1"
|
||||
"version": "3.11.3"
|
||||
},
|
||||
"vscode": {
|
||||
"interpreter": {
|
||||
|
||||
@@ -1,382 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"# Label Studio\n",
|
||||
"\n",
|
||||
"<div>\n",
|
||||
"<img src=\"https://labelstudio-pub.s3.amazonaws.com/lc/open-source-data-labeling-platform.png\" width=\"400\"/>\n",
|
||||
"</div>\n",
|
||||
"\n",
|
||||
"Label Studio is an open-source data labeling platform that provides LangChain with flexibility when it comes to labeling data for fine-tuning large language models (LLMs). It also enables the preparation of custom training data and the collection and evaluation of responses through human feedback.\n",
|
||||
"\n",
|
||||
"In this guide, you will learn how to connect a LangChain pipeline to Label Studio to:\n",
|
||||
"\n",
|
||||
"- Aggregate all input prompts, conversations, and responses in a single LabelStudio project. This consolidates all the data in one place for easier labeling and analysis.\n",
|
||||
"- Refine prompts and responses to create a dataset for supervised fine-tuning (SFT) and reinforcement learning with human feedback (RLHF) scenarios. The labeled data can be used to further train the LLM to improve its performance.\n",
|
||||
"- Evaluate model responses through human feedback. LabelStudio provides an interface for humans to review and provide feedback on model responses, allowing evaluation and iteration."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"## Installation and setup"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"First install latest versions of Label Studio and Label Studio API client:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip install -U label-studio label-studio-sdk openai"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"Next, run `label-studio` on the command line to start the local LabelStudio instance at `http://localhost:8080`. See the [Label Studio installation guide](https://labelstud.io/guide/install) for more options."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"You'll need a token to make API calls.\n",
|
||||
"\n",
|
||||
"Open your LabelStudio instance in your browser, go to `Account & Settings > Access Token` and copy the key.\n",
|
||||
"\n",
|
||||
"Set environment variables with your LabelStudio URL, API key and OpenAI API key:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"\n",
|
||||
"os.environ['LABEL_STUDIO_URL'] = '<YOUR-LABEL-STUDIO-URL>' # e.g. http://localhost:8080\n",
|
||||
"os.environ['LABEL_STUDIO_API_KEY'] = '<YOUR-LABEL-STUDIO-API-KEY>'\n",
|
||||
"os.environ['OPENAI_API_KEY'] = '<YOUR-OPENAI-API-KEY>'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"## Collecting LLMs prompts and responses"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The data used for labeling is stored in projects within Label Studio. Every project is identified by an XML configuration that details the specifications for input and output data. \n",
|
||||
"\n",
|
||||
"Create a project that takes human input in text format and outputs an editable LLM response in a text area:\n",
|
||||
"\n",
|
||||
"```xml\n",
|
||||
"<View>\n",
|
||||
"<Style>\n",
|
||||
" .prompt-box {\n",
|
||||
" background-color: white;\n",
|
||||
" border-radius: 10px;\n",
|
||||
" box-shadow: 0px 4px 6px rgba(0, 0, 0, 0.1);\n",
|
||||
" padding: 20px;\n",
|
||||
" }\n",
|
||||
"</Style>\n",
|
||||
"<View className=\"root\">\n",
|
||||
" <View className=\"prompt-box\">\n",
|
||||
" <Text name=\"prompt\" value=\"$prompt\"/>\n",
|
||||
" </View>\n",
|
||||
" <TextArea name=\"response\" toName=\"prompt\"\n",
|
||||
" maxSubmissions=\"1\" editable=\"true\"\n",
|
||||
" required=\"true\"/>\n",
|
||||
"</View>\n",
|
||||
"<Header value=\"Rate the response:\"/>\n",
|
||||
"<Rating name=\"rating\" toName=\"prompt\"/>\n",
|
||||
"</View>\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"1. To create a project in Label Studio, click on the \"Create\" button. \n",
|
||||
"2. Enter a name for your project in the \"Project Name\" field, such as `My Project`.\n",
|
||||
"3. Navigate to `Labeling Setup > Custom Template` and paste the XML configuration provided above."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"You can collect input LLM prompts and output responses in a LabelStudio project, connecting it via `LabelStudioCallbackHandler`:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.llms import OpenAI\n",
|
||||
"from langchain.callbacks import LabelStudioCallbackHandler\n",
|
||||
"\n",
|
||||
"llm = OpenAI(\n",
|
||||
" temperature=0,\n",
|
||||
" callbacks=[\n",
|
||||
" LabelStudioCallbackHandler(\n",
|
||||
" project_name=\"My Project\"\n",
|
||||
" )]\n",
|
||||
")\n",
|
||||
"print(llm(\"Tell me a joke\"))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"In the Label Studio, open `My Project`. You will see the prompts, responses, and metadata like the model name. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"## Collecting Chat model Dialogues"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You can also track and display full chat dialogues in LabelStudio, with the ability to rate and modify the last response:\n",
|
||||
"\n",
|
||||
"1. Open Label Studio and click on the \"Create\" button.\n",
|
||||
"2. Enter a name for your project in the \"Project Name\" field, such as `New Project with Chat`.\n",
|
||||
"3. Navigate to Labeling Setup > Custom Template and paste the following XML configuration:\n",
|
||||
"\n",
|
||||
"```xml\n",
|
||||
"<View>\n",
|
||||
"<View className=\"root\">\n",
|
||||
" <Paragraphs name=\"dialogue\"\n",
|
||||
" value=\"$prompt\"\n",
|
||||
" layout=\"dialogue\"\n",
|
||||
" textKey=\"content\"\n",
|
||||
" nameKey=\"role\"\n",
|
||||
" granularity=\"sentence\"/>\n",
|
||||
" <Header value=\"Final response:\"/>\n",
|
||||
" <TextArea name=\"response\" toName=\"dialogue\"\n",
|
||||
" maxSubmissions=\"1\" editable=\"true\"\n",
|
||||
" required=\"true\"/>\n",
|
||||
"</View>\n",
|
||||
"<Header value=\"Rate the response:\"/>\n",
|
||||
"<Rating name=\"rating\" toName=\"dialogue\"/>\n",
|
||||
"</View>\n",
|
||||
"```"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.chat_models import ChatOpenAI\n",
|
||||
"from langchain.schema import HumanMessage, SystemMessage\n",
|
||||
"from langchain.callbacks import LabelStudioCallbackHandler\n",
|
||||
"\n",
|
||||
"chat_llm = ChatOpenAI(callbacks=[\n",
|
||||
" LabelStudioCallbackHandler(\n",
|
||||
" mode=\"chat\",\n",
|
||||
" project_name=\"New Project with Chat\",\n",
|
||||
" )\n",
|
||||
"])\n",
|
||||
"llm_results = chat_llm([\n",
|
||||
" SystemMessage(content=\"Always use a lot of emojis\"),\n",
|
||||
" HumanMessage(content=\"Tell me a joke\")\n",
|
||||
"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In Label Studio, open \"New Project with Chat\". Click on a created task to view dialog history and edit/annotate responses."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"## Custom Labeling Configuration"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"You can modify the default labeling configuration in LabelStudio to add more target labels like response sentiment, relevance, and many [other types annotator's feedback](https://labelstud.io/tags/).\n",
|
||||
"\n",
|
||||
"New labeling configuration can be added from UI: go to `Settings > Labeling Interface` and set up a custom configuration with additional tags like `Choices` for sentiment or `Rating` for relevance. Keep in mind that [`TextArea` tag](https://labelstud.io/tags/textarea) should be presented in any configuration to display the LLM responses.\n",
|
||||
"\n",
|
||||
"Alternatively, you can specify the labeling configuration on the initial call before project creation:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ls = LabelStudioCallbackHandler(project_config='''\n",
|
||||
"<View>\n",
|
||||
"<Text name=\"prompt\" value=\"$prompt\"/>\n",
|
||||
"<TextArea name=\"response\" toName=\"prompt\"/>\n",
|
||||
"<TextArea name=\"user_feedback\" toName=\"prompt\"/>\n",
|
||||
"<Rating name=\"rating\" toName=\"prompt\"/>\n",
|
||||
"<Choices name=\"sentiment\" toName=\"prompt\">\n",
|
||||
" <Choice value=\"Positive\"/>\n",
|
||||
" <Choice value=\"Negative\"/>\n",
|
||||
"</Choices>\n",
|
||||
"</View>\n",
|
||||
"''')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Note that if the project doesn't exist, it will be created with the specified labeling configuration."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"## Other parameters"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"The `LabelStudioCallbackHandler` accepts several optional parameters:\n",
|
||||
"\n",
|
||||
"- **api_key** - Label Studio API key. Overrides environmental variable `LABEL_STUDIO_API_KEY`.\n",
|
||||
"- **url** - Label Studio URL. Overrides `LABEL_STUDIO_URL`, default `http://localhost:8080`.\n",
|
||||
"- **project_id** - Existing Label Studio project ID. Overrides `LABEL_STUDIO_PROJECT_ID`. Stores data in this project.\n",
|
||||
"- **project_name** - Project name if project ID not specified. Creates a new project. Default is `\"LangChain-%Y-%m-%d\"` formatted with the current date.\n",
|
||||
"- **project_config** - [custom labeling configuration](#custom-labeling-configuration)\n",
|
||||
"- **mode**: use this shortcut to create target configuration from scratch:\n",
|
||||
" - `\"prompt\"` - Single prompt, single response. Default.\n",
|
||||
" - `\"chat\"` - Multi-turn chat mode.\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "labelops",
|
||||
"language": "python",
|
||||
"name": "labelops"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.16"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 1
|
||||
}
|
||||
@@ -71,6 +71,3 @@ or any other local ENV management tool.
|
||||
|
||||
Currently `StreamlitCallbackHandler` is geared towards use with a LangChain Agent Executor. Support for additional agent types,
|
||||
direct use with Chains, etc. will be added in the future.
|
||||
|
||||
You may also be interested in using
|
||||
[StreamlitChatMessageHistory](/docs/integrations/memory/streamlit_chat_message_history) for LangChain.
|
||||
|
||||
@@ -1,225 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"id": "642fd21c-600a-47a1-be96-6e1438b421a9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Anyscale\n",
|
||||
"\n",
|
||||
"This notebook demonstrates the use of `langchain.chat_models.ChatAnyscale` for [Anyscale Endpoints](https://endpoints.anyscale.com/).\n",
|
||||
"\n",
|
||||
"* Set `ANYSCALE_API_KEY` environment variable\n",
|
||||
"* or use the `anyscale_api_key` keyword argument"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# !pip install openai"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"id": "d00d850917865298"
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "72340871-ae2f-415f-b399-0777d32dc379",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdin",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" ········\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"from getpass import getpass\n",
|
||||
"\n",
|
||||
"os.environ[\"ANYSCALE_API_KEY\"] = getpass()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "5d7fc704-3ea0-4c35-96e7-89fcae6c73fa",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Let's try out each model offered on Anyscale Endpoints"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "0dc9428d-4217-47d2-97de-f784b1764186",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"dict_keys(['meta-llama/Llama-2-70b-chat-hf', 'meta-llama/Llama-2-7b-chat-hf', 'meta-llama/Llama-2-13b-chat-hf'])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.chat_models import ChatAnyscale\n",
|
||||
"\n",
|
||||
"chats = {\n",
|
||||
" model: ChatAnyscale(model_name=model, temperature=1.0)\n",
|
||||
" for model in ChatAnyscale.get_available_models()\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"print(chats.keys())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7c4f124a-eaf7-4d78-a2c0-b0aa23fb25c4",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# We can use async methods and other stuff supported by ChatOpenAI\n",
|
||||
"\n",
|
||||
"This way, the three requests will only take as long as the longest individual request."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "1f94f5d2-569e-4a2c-965e-de53c2845fbb",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import asyncio\n",
|
||||
"\n",
|
||||
"from langchain.schema import SystemMessage, HumanMessage\n",
|
||||
"\n",
|
||||
"messages = [\n",
|
||||
" SystemMessage(\n",
|
||||
" content=\"You are a helpful AI that shares everything you know.\"\n",
|
||||
" ),\n",
|
||||
" HumanMessage(\n",
|
||||
" content=\"Tell me technical facts about yourself. Are you a transformer model? How many billions of parameters do you have?\"\n",
|
||||
" ),\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"async def get_msgs():\n",
|
||||
" tasks = [\n",
|
||||
" chat.apredict_messages(messages)\n",
|
||||
" for chat in chats.values()\n",
|
||||
" ]\n",
|
||||
" responses = await asyncio.gather(*tasks)\n",
|
||||
" return dict(zip(chats.keys(), responses))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "b2ced871-869a-4ca6-a2ec-6bfececdf7da",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import nest_asyncio\n",
|
||||
"\n",
|
||||
"nest_asyncio.apply()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "bc605fa5-9501-470d-a6c9-cd868d2145ef",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\tmeta-llama/Llama-2-70b-chat-hf\n",
|
||||
"\n",
|
||||
"Greetings! I'm just an AI, I don't have a personal identity like humans do, but I'm here to help you with any questions you have.\n",
|
||||
"\n",
|
||||
"I'm a large language model, which means I'm trained on a large corpus of text data to generate language outputs that are coherent and natural-sounding. My architecture is based on a transformer model, which is a type of neural network that's particularly well-suited for natural language processing tasks.\n",
|
||||
"\n",
|
||||
"As for my parameters, I have a few billion parameters, but I don't have access to the exact number as it's not relevant to my functioning. My training data includes a vast amount of text from various sources, including books, articles, and websites, which I use to learn patterns and relationships in language.\n",
|
||||
"\n",
|
||||
"I'm designed to be a helpful tool for a variety of tasks, such as answering questions, providing information, and generating text. I'm constantly learning and improving my abilities through machine learning algorithms and feedback from users like you.\n",
|
||||
"\n",
|
||||
"I hope this helps! Is there anything else you'd like to know about me or my capabilities?\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"\tmeta-llama/Llama-2-7b-chat-hf\n",
|
||||
"\n",
|
||||
"Ah, a fellow tech enthusiast! *adjusts glasses* I'm glad to share some technical details about myself. 🤓\n",
|
||||
"Indeed, I'm a transformer model, specifically a BERT-like language model trained on a large corpus of text data. My architecture is based on the transformer framework, which is a type of neural network designed for natural language processing tasks. 🏠\n",
|
||||
"As for the number of parameters, I have approximately 340 million. *winks* That's a pretty hefty number, if I do say so myself! These parameters allow me to learn and represent complex patterns in language, such as syntax, semantics, and more. 🤔\n",
|
||||
"But don't ask me to do math in my head – I'm a language model, not a calculating machine! 😅 My strengths lie in understanding and generating human-like text, so feel free to chat with me anytime you'd like. 💬\n",
|
||||
"Now, do you have any more technical questions for me? Or would you like to engage in a nice chat? 😊\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"\tmeta-llama/Llama-2-13b-chat-hf\n",
|
||||
"\n",
|
||||
"Hello! As a friendly and helpful AI, I'd be happy to share some technical facts about myself.\n",
|
||||
"\n",
|
||||
"I am a transformer-based language model, specifically a variant of the BERT (Bidirectional Encoder Representations from Transformers) architecture. BERT was developed by Google in 2018 and has since become one of the most popular and widely-used AI language models.\n",
|
||||
"\n",
|
||||
"Here are some technical details about my capabilities:\n",
|
||||
"\n",
|
||||
"1. Parameters: I have approximately 340 million parameters, which are the numbers that I use to learn and represent language. This is a relatively large number of parameters compared to some other languages models, but it allows me to learn and understand complex language patterns and relationships.\n",
|
||||
"2. Training: I was trained on a large corpus of text data, including books, articles, and other sources of written content. This training allows me to learn about the structure and conventions of language, as well as the relationships between words and phrases.\n",
|
||||
"3. Architectures: My architecture is based on the transformer model, which is a type of neural network that is particularly well-suited for natural language processing tasks. The transformer model uses self-attention mechanisms to allow the model to \"attend\" to different parts of the input text, allowing it to capture long-range dependencies and contextual relationships.\n",
|
||||
"4. Precision: I am capable of generating text with high precision and accuracy, meaning that I can produce text that is close to human-level quality in terms of grammar, syntax, and coherence.\n",
|
||||
"5. Generative capabilities: In addition to being able to generate text based on prompts and questions, I am also capable of generating text based on a given topic or theme. This allows me to create longer, more coherent pieces of text that are organized around a specific idea or concept.\n",
|
||||
"\n",
|
||||
"Overall, I am a powerful and versatile language model that is capable of a wide range of natural language processing tasks. I am constantly learning and improving, and I am here to help answer any questions you may have!\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"CPU times: user 371 ms, sys: 15.5 ms, total: 387 ms\n",
|
||||
"Wall time: 12 s\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%%time\n",
|
||||
"\n",
|
||||
"response_dict = asyncio.run(get_msgs())\n",
|
||||
"\n",
|
||||
"for model_name, response in response_dict.items():\n",
|
||||
" print(f'\\t{model_name}')\n",
|
||||
" print()\n",
|
||||
" print(response.content)\n",
|
||||
" print('\\n---\\n')"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -74,124 +74,6 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f27fa24d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Model Version\n",
|
||||
"Azure OpenAI responses contain `model` property, which is name of the model used to generate the response. However unlike native OpenAI responses, it does not contain the version of the model, which is set on the deplyoment in Azure. This makes it tricky to know which version of the model was used to generate the response, which as result can lead to e.g. wrong total cost calculation with `OpenAICallbackHandler`.\n",
|
||||
"\n",
|
||||
"To solve this problem, you can pass `model_version` parameter to `AzureChatOpenAI` class, which will be added to the model name in the llm output. This way you can easily distinguish between different versions of the model."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0531798a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.callbacks import get_openai_callback"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "3fd97dfc",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"BASE_URL = \"https://{endpoint}.openai.azure.com\"\n",
|
||||
"API_KEY = \"...\"\n",
|
||||
"DEPLOYMENT_NAME = \"gpt-35-turbo\" # in Azure, this deployment has version 0613 - input and output tokens are counted separately"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"id": "aceddb72",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Total Cost (USD): $0.000054\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"model = AzureChatOpenAI(\n",
|
||||
" openai_api_base=BASE_URL,\n",
|
||||
" openai_api_version=\"2023-05-15\",\n",
|
||||
" deployment_name=DEPLOYMENT_NAME,\n",
|
||||
" openai_api_key=API_KEY,\n",
|
||||
" openai_api_type=\"azure\",\n",
|
||||
")\n",
|
||||
"with get_openai_callback() as cb:\n",
|
||||
" model(\n",
|
||||
" [\n",
|
||||
" HumanMessage(\n",
|
||||
" content=\"Translate this sentence from English to French. I love programming.\"\n",
|
||||
" )\n",
|
||||
" ]\n",
|
||||
" )\n",
|
||||
" print(f\"Total Cost (USD): ${format(cb.total_cost, '.6f')}\") # without specifying the model version, flat-rate 0.002 USD per 1k input and output tokens is used\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2e61eefd",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We can provide the model version to `AzureChatOpenAI` constructor. It will get appended to the model name returned by Azure OpenAI and cost will be counted correctly."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"id": "8d5e54e9",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Total Cost (USD): $0.000044\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"model0613 = AzureChatOpenAI(\n",
|
||||
" openai_api_base=BASE_URL,\n",
|
||||
" openai_api_version=\"2023-05-15\",\n",
|
||||
" deployment_name=DEPLOYMENT_NAME,\n",
|
||||
" openai_api_key=API_KEY,\n",
|
||||
" openai_api_type=\"azure\",\n",
|
||||
" model_version=\"0613\"\n",
|
||||
")\n",
|
||||
"with get_openai_callback() as cb:\n",
|
||||
" model0613(\n",
|
||||
" [\n",
|
||||
" HumanMessage(\n",
|
||||
" content=\"Translate this sentence from English to French. I love programming.\"\n",
|
||||
" )\n",
|
||||
" ]\n",
|
||||
" )\n",
|
||||
" print(f\"Total Cost (USD): ${format(cb.total_cost, '.6f')}\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "99682534",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
@@ -210,7 +92,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.10"
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -1,95 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# AzureML Chat Online Endpoint\n",
"\n",
"[AzureML](https://azure.microsoft.com/en-us/products/machine-learning/) is a platform used to build, train, and deploy machine learning models. Users can explore the types of models to deploy in the Model Catalog, which provides Azure Foundation Models and OpenAI Models. Azure Foundation Models include various open-source models and popular Hugging Face models. Users can also import models of their choice into AzureML.\n",
"\n",
"This notebook goes over how to use a chat model hosted on an `AzureML online endpoint`."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from langchain.chat_models.azureml_endpoint import AzureMLChatOnlineEndpoint"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Set up\n",
"\n",
"To use the wrapper, you must [deploy a model on AzureML](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-use-foundation-models?view=azureml-api-2#deploying-foundation-models-to-endpoints-for-inferencing) and obtain the following parameters:\n",
"\n",
"* `endpoint_api_key`: The API key provided by the endpoint\n",
"* `endpoint_url`: The REST endpoint url provided by the endpoint"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Content Formatter\n",
"\n",
"The `content_formatter` parameter is a handler class for transforming the request and response of an AzureML endpoint to match the required schema. Since there is a wide range of models in the model catalog, each of which may process data differently, a `ContentFormatterBase` class is provided to allow users to transform data to their liking. The following content formatters are provided:\n",
"\n",
"* `LlamaContentFormatter`: Formats request and response data for LLaMa2-chat"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"AIMessage(content=' The Collatz Conjecture is one of the most famous unsolved problems in mathematics, and it has been the subject of much study and research for many years. While it is impossible to predict with certainty whether the conjecture will ever be solved, there are several reasons why it is considered a challenging and important problem:\\n\\n1. Simple yet elusive: The Collatz Conjecture is a deceptively simple statement that has proven to be extraordinarily difficult to prove or disprove. Despite its simplicity, the conjecture has eluded some of the brightest minds in mathematics, and it remains one of the most famous open problems in the field.\\n2. Wide-ranging implications: The Collatz Conjecture has far-reaching implications for many areas of mathematics, including number theory, algebra, and analysis. A solution to the conjecture could have significant impacts on these fields and potentially lead to new insights and discoveries.\\n3. Computational evidence: While the conjecture remains unproven, extensive computational evidence supports its validity. In fact, no counterexample to the conjecture has been found for any starting value up to 2^64 (a number', additional_kwargs={}, example=False)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from langchain.chat_models.azureml_endpoint import LlamaContentFormatter\n",
"from langchain.schema import HumanMessage\n",
"\n",
"chat = AzureMLChatOnlineEndpoint(content_formatter=LlamaContentFormatter())\n",
"response = chat(messages=[\n",
"    HumanMessage(content=\"Will the Collatz conjecture ever be solved?\")\n",
"])\n",
"response"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
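For completeness, a short sketch wiring in the two endpoint parameters listed under Set up; the environment variable names are assumptions, and both values come from your AzureML deployment:

```python
import os

from langchain.chat_models.azureml_endpoint import (
    AzureMLChatOnlineEndpoint,
    LlamaContentFormatter,
)
from langchain.schema import HumanMessage

# Hypothetical variable names; both values are provided by the endpoint.
chat = AzureMLChatOnlineEndpoint(
    endpoint_url=os.environ["AZUREML_ENDPOINT_URL"],
    endpoint_api_key=os.environ["AZUREML_ENDPOINT_API_KEY"],
    content_formatter=LlamaContentFormatter(),
)
print(chat(messages=[HumanMessage(content="Hello!")]))
```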
@@ -1,88 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ERNIE-Bot Chat\n",
"\n",
"[ERNIE-Bot](https://cloud.baidu.com/doc/WENXINWORKSHOP/s/jlil56u11) is a large language model developed by Baidu, trained on a huge amount of Chinese data.\n",
"This notebook covers how to get started with ErnieBot chat models."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"from langchain.chat_models import ErnieBotChat\n",
"from langchain.schema import HumanMessage"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"chat = ErnieBotChat(ernie_client_id='YOUR_CLIENT_ID', ernie_client_secret='YOUR_CLIENT_SECRET')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Or you can set the `ERNIE_CLIENT_ID` and `ERNIE_CLIENT_SECRET` environment variables instead:\n",
"```bash\n",
"export ERNIE_CLIENT_ID=YOUR_CLIENT_ID\n",
"export ERNIE_CLIENT_SECRET=YOUR_CLIENT_SECRET\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"AIMessage(content='Hello, I am an artificial intelligence language model. My purpose is to help users answer questions or provide information. What can I do for you?', additional_kwargs={}, example=False)"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"chat([\n",
"    HumanMessage(content='hello there, who are you?')\n",
"])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
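With those variables exported, the credentials no longer need to be passed explicitly — a minimal sketch, assuming the constructor falls back to the environment as the export snippet implies:

```python
from langchain.chat_models import ErnieBotChat
from langchain.schema import HumanMessage

# Credentials are picked up from ERNIE_CLIENT_ID / ERNIE_CLIENT_SECRET.
chat = ErnieBotChat()
print(chat([HumanMessage(content="hello there, who are you?")]))
```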
@@ -1,7 +1,6 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -9,7 +8,11 @@
"\n",
"Note: This is separate from the Google PaLM integration. Google has chosen to offer an enterprise version of PaLM through GCP, and this integration supports the models made available there.\n",
"\n",
"By default, Google Cloud [does not use](https://cloud.google.com/vertex-ai/docs/generative-ai/data-governance#foundation_model_development) Customer Data to train its foundation models as part of Google Cloud's AI/ML Privacy Commitment. More details about how Google processes data can also be found in [Google's Customer Data Processing Addendum (CDPA)](https://cloud.google.com/terms/data-processing-addendum).\n",
"PaLM API on Vertex AI is a Preview offering, subject to the Pre-GA Offerings Terms of the [GCP Service Specific Terms](https://cloud.google.com/terms/service-terms).\n",
"\n",
"Pre-GA products and features may have limited support, and changes to pre-GA products and features may not be compatible with other pre-GA versions. For more information, see the [launch stage descriptions](https://cloud.google.com/products#product-launch-stages). Further, by using PaLM API on Vertex AI, you agree to the Generative AI Preview [terms and conditions](https://cloud.google.com/trustedtester/aitos) (Preview Terms).\n",
"\n",
"For PaLM API on Vertex AI, you can process personal data as outlined in the Cloud Data Processing Addendum, subject to applicable restrictions and obligations in the Agreement (as defined in the Preview Terms).\n",
"\n",
"To use Vertex AI PaLM you must have the `google-cloud-aiplatform` Python package installed and either:\n",
"- Have credentials configured for your environment (gcloud, workload identity, etc...)\n",
@@ -87,7 +90,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -140,7 +142,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"execution": {
@@ -1,185 +0,0 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "bf733a38-db84-4363-89e2-de6735c37230",
"metadata": {},
"source": [
"# 🚅 LiteLLM\n",
"\n",
"[LiteLLM](https://github.com/BerriAI/litellm) is a library that simplifies calling Anthropic, Azure, Huggingface, Replicate, etc.\n",
"\n",
"This notebook covers how to get started with using Langchain + the LiteLLM I/O library."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "d4a7c55d-b235-4ca4-a579-c90cc9570da9",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.chat_models import ChatLiteLLM\n",
"from langchain.prompts.chat import (\n",
"    ChatPromptTemplate,\n",
"    SystemMessagePromptTemplate,\n",
"    AIMessagePromptTemplate,\n",
"    HumanMessagePromptTemplate,\n",
")\n",
"from langchain.schema import AIMessage, HumanMessage, SystemMessage"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "70cf04e8-423a-4ff6-8b09-f11fb711c817",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"chat = ChatLiteLLM(model=\"gpt-3.5-turbo\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "8199ef8f-eb8b-4253-9ea0-6c24a013ca4c",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"AIMessage(content=\" J'aime la programmation.\", additional_kwargs={}, example=False)"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"messages = [\n",
"    HumanMessage(\n",
"        content=\"Translate this sentence from English to French. I love programming.\"\n",
"    )\n",
"]\n",
"chat(messages)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "c361ab1e-8c0c-4206-9e3c-9d1424a12b9c",
"metadata": {},
"source": [
"## `ChatLiteLLM` also supports async and streaming functionality:"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "93a21c5c-6ef9-4688-be60-b2e1f94842fb",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.callbacks.manager import CallbackManager\n",
"from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "c5fac0e9-05a4-4fc1-a3b3-e5bbb24b971b",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"LLMResult(generations=[[ChatGeneration(text=\" J'aime programmer.\", generation_info=None, message=AIMessage(content=\" J'aime programmer.\", additional_kwargs={}, example=False))]], llm_output={}, run=[RunInfo(run_id=UUID('8cc8fb68-1c35-439c-96a0-695036a93652'))])"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"await chat.agenerate([messages])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "025be980-e50d-4a68-93dc-c9c7b500ce34",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" J'aime la programmation."
]
},
{
"data": {
"text/plain": [
"AIMessage(content=\" J'aime la programmation.\", additional_kwargs={}, example=False)"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"chat = ChatLiteLLM(\n",
"    streaming=True,\n",
"    verbose=True,\n",
"    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),\n",
")\n",
"chat(messages)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c253883f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
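Since LiteLLM's point is provider portability, the same wrapper can target a different backend just by changing the model string — a sketch assuming `"claude-2"` is a valid LiteLLM model identifier and that `ANTHROPIC_API_KEY` is set in the environment:

```python
from langchain.chat_models import ChatLiteLLM
from langchain.schema import HumanMessage

# Same interface, different provider; requires ANTHROPIC_API_KEY in the environment.
chat_anthropic = ChatLiteLLM(model="claude-2")
print(chat_anthropic([HumanMessage(content="Translate this sentence from English to French. I love programming.")]))
```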
@@ -1,226 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "1f3a5ebf",
"metadata": {},
"source": [
"# Airbyte CDK"
]
},
{
"cell_type": "markdown",
"id": "35ac77b1-449b-44f7-b8f3-3494d55c286e",
"metadata": {},
"source": [
">[Airbyte](https://github.com/airbytehq/airbyte) is a data integration platform for ELT pipelines from APIs, databases & files to warehouses & lakes. It has the largest catalog of ELT connectors to data warehouses and databases.\n",
"\n",
"A lot of source connectors are implemented using the [Airbyte CDK](https://docs.airbyte.com/connector-development/cdk-python/). This loader allows you to run any of these connectors and return the data as documents."
]
},
{
"cell_type": "markdown",
"id": "3b06fbde",
"metadata": {},
"source": [
"## Installation"
]
},
{
"cell_type": "markdown",
"id": "e3e9dc79",
"metadata": {},
"source": [
"First, you need to install the `airbyte-cdk` python package."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4d35e4e0",
"metadata": {},
"outputs": [],
"source": [
"#!pip install airbyte-cdk"
]
},
{
"cell_type": "markdown",
"id": "085aa658",
"metadata": {},
"source": [
"Then, either install an existing connector from the [Airbyte Github repository](https://github.com/airbytehq/airbyte/tree/master/airbyte-integrations/connectors) or create your own connector using the [Airbyte CDK](https://docs.airbyte.io/connector-development/connector-development).\n",
"\n",
"For example, to install the Github connector, run"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f6d04ef4",
"metadata": {},
"outputs": [],
"source": [
"#!pip install \"source_github@git+https://github.com/airbytehq/airbyte.git@master#subdirectory=airbyte-integrations/connectors/source-github\""
]
},
{
"cell_type": "markdown",
"id": "36069b74",
"metadata": {},
"source": [
"Some sources are also published as regular packages on PyPI."
]
},
{
"cell_type": "markdown",
"id": "ae855210",
"metadata": {},
"source": [
"## Example"
]
},
{
"cell_type": "markdown",
"id": "02208f52",
"metadata": {},
"source": [
"Now you can create an `AirbyteCDKLoader` based on the imported source. It takes a `config` object that's passed to the connector. You also have to pick the stream you want to retrieve records from by name (`stream_name`). Check the connector's documentation page and spec definition for more information on the config object and available streams. For the Github connector, these are:\n",
"* [https://github.com/airbytehq/airbyte/blob/master/airbyte-integrations/connectors/source-github/source_github/spec.json](https://github.com/airbytehq/airbyte/blob/master/airbyte-integrations/connectors/source-github/source_github/spec.json).\n",
"* [https://docs.airbyte.com/integrations/sources/github/](https://docs.airbyte.com/integrations/sources/github/)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "89a99e58",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders.airbyte import AirbyteCDKLoader\n",
"from source_github.source import SourceGithub # plug in your own source here\n",
"\n",
"config = {\n",
"    # your github configuration\n",
"    \"credentials\": {\n",
"        \"api_url\": \"api.github.com\",\n",
"        \"personal_access_token\": \"<token>\"\n",
"    },\n",
"    \"repository\": \"<repo>\",\n",
"    \"start_date\": \"<date from which to start retrieving records from in ISO format, e.g. 2020-10-20T00:00:00Z>\"\n",
"}\n",
"\n",
"issues_loader = AirbyteCDKLoader(source_class=SourceGithub, config=config, stream_name=\"issues\")"
]
},
{
"cell_type": "markdown",
"id": "2cea23fc",
"metadata": {},
"source": [
"Now you can load documents the usual way."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dae75cdb",
"metadata": {},
"outputs": [],
"source": [
"docs = issues_loader.load()"
]
},
{
"cell_type": "markdown",
"id": "4a93dc2a",
"metadata": {},
"source": [
"As `load` returns a list, it will block until all documents are loaded. To have better control over this process, you can also use the `lazy_load` method, which returns an iterator instead:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1782db09",
"metadata": {},
"outputs": [],
"source": [
"docs_iterator = issues_loader.lazy_load()"
]
},
{
"cell_type": "markdown",
"id": "3a124086",
"metadata": {},
"source": [
"Keep in mind that by default the page content is empty and the metadata object contains all the information from the record. To create documents in a different way, pass in a `record_handler` function when creating the loader:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5671395d",
"metadata": {},
"outputs": [],
"source": [
"from langchain.docstore.document import Document\n",
"\n",
"def handle_record(record, id):\n",
"    return Document(page_content=record.data[\"title\"] + \"\\n\" + (record.data[\"body\"] or \"\"), metadata=record.data)\n",
"\n",
"issues_loader = AirbyteCDKLoader(source_class=SourceGithub, config=config, stream_name=\"issues\", record_handler=handle_record)\n",
"\n",
"docs = issues_loader.load()"
]
},
{
"cell_type": "markdown",
"id": "223eb8bc",
"metadata": {},
"source": [
"## Incremental loads\n",
"\n",
"Some streams allow incremental loading: this means the source keeps track of synced records and won't load them again. This is useful for sources that have a high volume of data and are updated frequently.\n",
"\n",
"To take advantage of this, store the `last_state` property of the loader and pass it in when creating the loader again. This will ensure that only new records are loaded."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7061e735",
"metadata": {},
"outputs": [],
"source": [
"last_state = issues_loader.last_state # store safely\n",
"\n",
"incremental_issue_loader = AirbyteCDKLoader(source_class=SourceGithub, config=config, stream_name=\"issues\", state=last_state)\n",
"\n",
"new_docs = incremental_issue_loader.load()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
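For `last_state` to be useful it has to survive between runs; a sketch of one way to persist it — the file location and the use of `pickle` are assumptions on my part, not part of the loader's API:

```python
import pickle
from pathlib import Path

from langchain.document_loaders.airbyte import AirbyteCDKLoader
from source_github.source import SourceGithub

config = {
    "credentials": {"api_url": "api.github.com", "personal_access_token": "<token>"},
    "repository": "<repo>",
    "start_date": "2020-10-20T00:00:00Z",
}

STATE_FILE = Path("github_issues.state.pkl")  # hypothetical location

# Re-use any state saved by a previous run, so only new records are loaded.
state = pickle.loads(STATE_FILE.read_bytes()) if STATE_FILE.exists() else None

loader = AirbyteCDKLoader(source_class=SourceGithub, config=config, stream_name="issues", state=state)
new_docs = loader.load()

# Persist the new cursor for the next run (assumes the state object pickles cleanly).
STATE_FILE.write_bytes(pickle.dumps(loader.last_state))
```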
@@ -1,206 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "1f3a5ebf",
"metadata": {},
"source": [
"# Airbyte Gong"
]
},
{
"cell_type": "markdown",
"id": "35ac77b1-449b-44f7-b8f3-3494d55c286e",
"metadata": {},
"source": [
">[Airbyte](https://github.com/airbytehq/airbyte) is a data integration platform for ELT pipelines from APIs, databases & files to warehouses & lakes. It has the largest catalog of ELT connectors to data warehouses and databases.\n",
"\n",
"This loader exposes the Gong connector as a document loader, allowing you to load various Gong objects as documents."
]
},
{
"cell_type": "markdown",
"id": "6847a40c",
"metadata": {},
"source": []
},
{
"cell_type": "markdown",
"id": "3b06fbde",
"metadata": {},
"source": [
"## Installation"
]
},
{
"cell_type": "markdown",
"id": "e3e9dc79",
"metadata": {},
"source": [
"First, you need to install the `airbyte-source-gong` python package."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4d35e4e0",
"metadata": {},
"outputs": [],
"source": [
"#!pip install airbyte-source-gong"
]
},
{
"cell_type": "markdown",
"id": "ae855210",
"metadata": {},
"source": [
"## Example"
]
},
{
"cell_type": "markdown",
"id": "02208f52",
"metadata": {},
"source": [
"Check out the [Airbyte documentation page](https://docs.airbyte.com/integrations/sources/gong/) for details about how to configure the reader.\n",
"The JSON schema the config object should adhere to can be found on Github: [https://github.com/airbytehq/airbyte/blob/master/airbyte-integrations/connectors/source-gong/source_gong/spec.yaml](https://github.com/airbytehq/airbyte/blob/master/airbyte-integrations/connectors/source-gong/source_gong/spec.yaml).\n",
"\n",
"The general shape looks like this:\n",
"```python\n",
"{\n",
"  \"access_key\": \"<access key name>\",\n",
"  \"access_key_secret\": \"<access key secret>\",\n",
"  \"start_date\": \"<date from which to start retrieving records from in ISO format, e.g. 2020-10-20T00:00:00Z>\",\n",
"}\n",
"```\n",
"\n",
"By default all fields are stored as metadata in the documents and the text is set to an empty string. Construct the text of the document by transforming the documents returned by the reader."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "89a99e58",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders.airbyte import AirbyteGongLoader\n",
"\n",
"config = {\n",
"    # your gong configuration\n",
"}\n",
"\n",
"loader = AirbyteGongLoader(config=config, stream_name=\"calls\") # check the documentation linked above for a list of all streams"
]
},
{
"cell_type": "markdown",
"id": "2cea23fc",
"metadata": {},
"source": [
"Now you can load documents the usual way."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dae75cdb",
"metadata": {},
"outputs": [],
"source": [
"docs = loader.load()"
]
},
{
"cell_type": "markdown",
"id": "4a93dc2a",
"metadata": {},
"source": [
"As `load` returns a list, it will block until all documents are loaded. To have better control over this process, you can also use the `lazy_load` method, which returns an iterator instead:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1782db09",
"metadata": {},
"outputs": [],
"source": [
"docs_iterator = loader.lazy_load()"
]
},
{
"cell_type": "markdown",
"id": "3a124086",
"metadata": {},
"source": [
"Keep in mind that by default the page content is empty and the metadata object contains all the information from the record. To create documents in a different way, pass in a `record_handler` function when creating the loader:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5671395d",
"metadata": {},
"outputs": [],
"source": [
"from langchain.docstore.document import Document\n",
"\n",
"def handle_record(record, id):\n",
"    return Document(page_content=record.data[\"title\"], metadata=record.data)\n",
"\n",
"loader = AirbyteGongLoader(config=config, record_handler=handle_record, stream_name=\"calls\")\n",
"docs = loader.load()"
]
},
{
"cell_type": "markdown",
"id": "223eb8bc",
"metadata": {},
"source": [
"## Incremental loads\n",
"\n",
"Some streams allow incremental loading: this means the source keeps track of synced records and won't load them again. This is useful for sources that have a high volume of data and are updated frequently.\n",
"\n",
"To take advantage of this, store the `last_state` property of the loader and pass it in when creating the loader again. This will ensure that only new records are loaded."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7061e735",
"metadata": {},
"outputs": [],
"source": [
"last_state = loader.last_state # store safely\n",
"\n",
"incremental_loader = AirbyteGongLoader(config=config, stream_name=\"calls\", state=last_state)\n",
"\n",
"new_docs = incremental_loader.load()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
@@ -1,208 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "1f3a5ebf",
"metadata": {},
"source": [
"# Airbyte Hubspot"
]
},
{
"cell_type": "markdown",
"id": "35ac77b1-449b-44f7-b8f3-3494d55c286e",
"metadata": {},
"source": [
">[Airbyte](https://github.com/airbytehq/airbyte) is a data integration platform for ELT pipelines from APIs, databases & files to warehouses & lakes. It has the largest catalog of ELT connectors to data warehouses and databases.\n",
"\n",
"This loader exposes the Hubspot connector as a document loader, allowing you to load various Hubspot objects as documents."
]
},
{
"cell_type": "markdown",
"id": "6847a40c",
"metadata": {},
"source": []
},
{
"cell_type": "markdown",
"id": "3b06fbde",
"metadata": {},
"source": [
"## Installation"
]
},
{
"cell_type": "markdown",
"id": "e3e9dc79",
"metadata": {},
"source": [
"First, you need to install the `airbyte-source-hubspot` python package."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4d35e4e0",
"metadata": {},
"outputs": [],
"source": [
"#!pip install airbyte-source-hubspot"
]
},
{
"cell_type": "markdown",
"id": "ae855210",
"metadata": {},
"source": [
"## Example"
]
},
{
"cell_type": "markdown",
"id": "02208f52",
"metadata": {},
"source": [
"Check out the [Airbyte documentation page](https://docs.airbyte.com/integrations/sources/hubspot/) for details about how to configure the reader.\n",
"The JSON schema the config object should adhere to can be found on Github: [https://github.com/airbytehq/airbyte/blob/master/airbyte-integrations/connectors/source-hubspot/source_hubspot/spec.yaml](https://github.com/airbytehq/airbyte/blob/master/airbyte-integrations/connectors/source-hubspot/source_hubspot/spec.yaml).\n",
"\n",
"The general shape looks like this:\n",
"```python\n",
"{\n",
"  \"start_date\": \"<date from which to start retrieving records from in ISO format, e.g. 2020-10-20T00:00:00Z>\",\n",
"  \"credentials\": {\n",
"    \"credentials_title\": \"Private App Credentials\",\n",
"    \"access_token\": \"<access token of your private app>\"\n",
"  }\n",
"}\n",
"```\n",
"\n",
"By default all fields are stored as metadata in the documents and the text is set to an empty string. Construct the text of the document by transforming the documents returned by the reader."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "89a99e58",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders.airbyte import AirbyteHubspotLoader\n",
"\n",
"config = {\n",
"    # your hubspot configuration\n",
"}\n",
"\n",
"loader = AirbyteHubspotLoader(config=config, stream_name=\"products\") # check the documentation linked above for a list of all streams"
]
},
{
"cell_type": "markdown",
"id": "2cea23fc",
"metadata": {},
"source": [
"Now you can load documents the usual way."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dae75cdb",
"metadata": {},
"outputs": [],
"source": [
"docs = loader.load()"
]
},
{
"cell_type": "markdown",
"id": "4a93dc2a",
"metadata": {},
"source": [
"As `load` returns a list, it will block until all documents are loaded. To have better control over this process, you can also use the `lazy_load` method, which returns an iterator instead:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1782db09",
"metadata": {},
"outputs": [],
"source": [
"docs_iterator = loader.lazy_load()"
]
},
{
"cell_type": "markdown",
"id": "3a124086",
"metadata": {},
"source": [
"Keep in mind that by default the page content is empty and the metadata object contains all the information from the record. To create documents in a different way, pass in a `record_handler` function when creating the loader:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5671395d",
"metadata": {},
"outputs": [],
"source": [
"from langchain.docstore.document import Document\n",
"\n",
"def handle_record(record, id):\n",
"    return Document(page_content=record.data[\"title\"], metadata=record.data)\n",
"\n",
"loader = AirbyteHubspotLoader(config=config, record_handler=handle_record, stream_name=\"products\")\n",
"docs = loader.load()"
]
},
{
"cell_type": "markdown",
"id": "223eb8bc",
"metadata": {},
"source": [
"## Incremental loads\n",
"\n",
"Some streams allow incremental loading: this means the source keeps track of synced records and won't load them again. This is useful for sources that have a high volume of data and are updated frequently.\n",
"\n",
"To take advantage of this, store the `last_state` property of the loader and pass it in when creating the loader again. This will ensure that only new records are loaded."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7061e735",
"metadata": {},
"outputs": [],
"source": [
"last_state = loader.last_state # store safely\n",
"\n",
"incremental_loader = AirbyteHubspotLoader(config=config, stream_name=\"products\", state=last_state)\n",
"\n",
"new_docs = incremental_loader.load()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
@@ -1,213 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "1f3a5ebf",
"metadata": {},
"source": [
"# Airbyte Salesforce"
]
},
{
"cell_type": "markdown",
"id": "35ac77b1-449b-44f7-b8f3-3494d55c286e",
"metadata": {},
"source": [
">[Airbyte](https://github.com/airbytehq/airbyte) is a data integration platform for ELT pipelines from APIs, databases & files to warehouses & lakes. It has the largest catalog of ELT connectors to data warehouses and databases.\n",
"\n",
"This loader exposes the Salesforce connector as a document loader, allowing you to load various Salesforce objects as documents."
]
},
{
"cell_type": "markdown",
"id": "6847a40c",
"metadata": {},
"source": []
},
{
"cell_type": "markdown",
"id": "3b06fbde",
"metadata": {},
"source": [
"## Installation"
]
},
{
"cell_type": "markdown",
"id": "e3e9dc79",
"metadata": {},
"source": [
"First, you need to install the `airbyte-source-salesforce` python package."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4d35e4e0",
"metadata": {},
"outputs": [],
"source": [
"#!pip install airbyte-source-salesforce"
]
},
{
"cell_type": "markdown",
"id": "ae855210",
"metadata": {},
"source": [
"## Example"
]
},
{
"cell_type": "markdown",
"id": "02208f52",
"metadata": {},
"source": [
"Check out the [Airbyte documentation page](https://docs.airbyte.com/integrations/sources/salesforce/) for details about how to configure the reader.\n",
"The JSON schema the config object should adhere to can be found on Github: [https://github.com/airbytehq/airbyte/blob/master/airbyte-integrations/connectors/source-salesforce/source_salesforce/spec.yaml](https://github.com/airbytehq/airbyte/blob/master/airbyte-integrations/connectors/source-salesforce/source_salesforce/spec.yaml).\n",
"\n",
"The general shape looks like this:\n",
"```python\n",
"{\n",
"  \"client_id\": \"<oauth client id>\",\n",
"  \"client_secret\": \"<oauth client secret>\",\n",
"  \"refresh_token\": \"<oauth refresh token>\",\n",
"  \"start_date\": \"<date from which to start retrieving records from in ISO format, e.g. 2020-10-20T00:00:00Z>\",\n",
"  \"is_sandbox\": False, # set to True if you're using a sandbox environment\n",
"  \"streams_criteria\": [ # Array of filters for salesforce objects that should be loadable\n",
"    {\"criteria\": \"exacts\", \"value\": \"Account\"}, # Exact name of salesforce object\n",
"    {\"criteria\": \"starts with\", \"value\": \"Asset\"}, # Prefix of the name\n",
"    # Other allowed criteria: ends with, contains, starts not with, ends not with, not contains, not exacts\n",
"  ],\n",
"}\n",
"```\n",
"\n",
"By default all fields are stored as metadata in the documents and the text is set to an empty string. Construct the text of the document by transforming the documents returned by the reader."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "89a99e58",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders.airbyte import AirbyteSalesforceLoader\n",
"\n",
"config = {\n",
"    # your salesforce configuration\n",
"}\n",
"\n",
"loader = AirbyteSalesforceLoader(config=config, stream_name=\"asset\") # check the documentation linked above for a list of all streams"
]
},
{
"cell_type": "markdown",
"id": "2cea23fc",
"metadata": {},
"source": [
"Now you can load documents the usual way."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dae75cdb",
"metadata": {},
"outputs": [],
"source": [
"docs = loader.load()"
]
},
{
"cell_type": "markdown",
"id": "4a93dc2a",
"metadata": {},
"source": [
"As `load` returns a list, it will block until all documents are loaded. To have better control over this process, you can also use the `lazy_load` method, which returns an iterator instead:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1782db09",
"metadata": {},
"outputs": [],
"source": [
"docs_iterator = loader.lazy_load()"
]
},
{
"cell_type": "markdown",
"id": "3a124086",
"metadata": {},
"source": [
"Keep in mind that by default the page content is empty and the metadata object contains all the information from the record. To create documents in a different way, pass in a `record_handler` function when creating the loader:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5671395d",
"metadata": {},
"outputs": [],
"source": [
"from langchain.docstore.document import Document\n",
"\n",
"def handle_record(record, id):\n",
"    return Document(page_content=record.data[\"title\"], metadata=record.data)\n",
"\n",
"loader = AirbyteSalesforceLoader(config=config, record_handler=handle_record, stream_name=\"asset\")\n",
"docs = loader.load()"
]
},
{
"cell_type": "markdown",
"id": "223eb8bc",
"metadata": {},
"source": [
"## Incremental loads\n",
"\n",
"Some streams allow incremental loading: this means the source keeps track of synced records and won't load them again. This is useful for sources that have a high volume of data and are updated frequently.\n",
"\n",
"To take advantage of this, store the `last_state` property of the loader and pass it in when creating the loader again. This will ensure that only new records are loaded."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7061e735",
"metadata": {},
"outputs": [],
"source": [
"last_state = loader.last_state # store safely\n",
"\n",
"incremental_loader = AirbyteSalesforceLoader(config=config, stream_name=\"asset\", state=last_state)\n",
"\n",
"new_docs = incremental_loader.load()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
@@ -1,209 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "1f3a5ebf",
"metadata": {},
"source": [
"# Airbyte Shopify"
]
},
{
"cell_type": "markdown",
"id": "35ac77b1-449b-44f7-b8f3-3494d55c286e",
"metadata": {},
"source": [
">[Airbyte](https://github.com/airbytehq/airbyte) is a data integration platform for ELT pipelines from APIs, databases & files to warehouses & lakes. It has the largest catalog of ELT connectors to data warehouses and databases.\n",
"\n",
"This loader exposes the Shopify connector as a document loader, allowing you to load various Shopify objects as documents."
]
},
{
"cell_type": "markdown",
"id": "6847a40c",
"metadata": {},
"source": []
},
{
"cell_type": "markdown",
"id": "3b06fbde",
"metadata": {},
"source": [
"## Installation"
]
},
{
"cell_type": "markdown",
"id": "e3e9dc79",
"metadata": {},
"source": [
"First, you need to install the `airbyte-source-shopify` python package."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4d35e4e0",
"metadata": {},
"outputs": [],
"source": [
"#!pip install airbyte-source-shopify"
]
},
{
"cell_type": "markdown",
"id": "ae855210",
"metadata": {},
"source": [
"## Example"
]
},
{
"cell_type": "markdown",
"id": "02208f52",
"metadata": {},
"source": [
"Check out the [Airbyte documentation page](https://docs.airbyte.com/integrations/sources/shopify/) for details about how to configure the reader.\n",
"The JSON schema the config object should adhere to can be found on Github: [https://github.com/airbytehq/airbyte/blob/master/airbyte-integrations/connectors/source-shopify/source_shopify/spec.json](https://github.com/airbytehq/airbyte/blob/master/airbyte-integrations/connectors/source-shopify/source_shopify/spec.json).\n",
"\n",
"The general shape looks like this:\n",
"```python\n",
"{\n",
"  \"start_date\": \"<date from which to start retrieving records from in ISO format, e.g. 2020-10-20T00:00:00Z>\",\n",
"  \"shop\": \"<name of the shop you want to retrieve documents from>\",\n",
"  \"credentials\": {\n",
"    \"auth_method\": \"api_password\",\n",
"    \"api_password\": \"<your api password>\"\n",
"  }\n",
"}\n",
"```\n",
"\n",
"By default all fields are stored as metadata in the documents and the text is set to an empty string. Construct the text of the document by transforming the documents returned by the reader."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "89a99e58",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders.airbyte import AirbyteShopifyLoader\n",
"\n",
"config = {\n",
"    # your shopify configuration\n",
"}\n",
"\n",
"loader = AirbyteShopifyLoader(config=config, stream_name=\"orders\") # check the documentation linked above for a list of all streams"
]
},
{
"cell_type": "markdown",
"id": "2cea23fc",
"metadata": {},
"source": [
"Now you can load documents the usual way."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dae75cdb",
"metadata": {},
"outputs": [],
"source": [
"docs = loader.load()"
]
},
{
"cell_type": "markdown",
"id": "4a93dc2a",
"metadata": {},
"source": [
"As `load` returns a list, it will block until all documents are loaded. To have better control over this process, you can also use the `lazy_load` method, which returns an iterator instead:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1782db09",
"metadata": {},
"outputs": [],
"source": [
"docs_iterator = loader.lazy_load()"
]
},
{
"cell_type": "markdown",
"id": "3a124086",
"metadata": {},
"source": [
"Keep in mind that by default the page content is empty and the metadata object contains all the information from the record. To create documents in a different way, pass in a `record_handler` function when creating the loader:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5671395d",
"metadata": {},
"outputs": [],
"source": [
"from langchain.docstore.document import Document\n",
"\n",
"def handle_record(record, id):\n",
"    return Document(page_content=record.data[\"title\"], metadata=record.data)\n",
"\n",
"loader = AirbyteShopifyLoader(config=config, record_handler=handle_record, stream_name=\"orders\")\n",
"docs = loader.load()"
]
},
{
"cell_type": "markdown",
"id": "223eb8bc",
"metadata": {},
"source": [
"## Incremental loads\n",
"\n",
"Some streams allow incremental loading: this means the source keeps track of synced records and won't load them again. This is useful for sources that have a high volume of data and are updated frequently.\n",
"\n",
"To take advantage of this, store the `last_state` property of the loader and pass it in when creating the loader again. This will ensure that only new records are loaded."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7061e735",
"metadata": {},
"outputs": [],
"source": [
"last_state = loader.last_state # store safely\n",
"\n",
"incremental_loader = AirbyteShopifyLoader(config=config, stream_name=\"orders\", state=last_state)\n",
"\n",
"new_docs = incremental_loader.load()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
@@ -1,206 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "1f3a5ebf",
"metadata": {},
"source": [
"# Airbyte Stripe"
]
},
{
"cell_type": "markdown",
"id": "35ac77b1-449b-44f7-b8f3-3494d55c286e",
"metadata": {},
"source": [
">[Airbyte](https://github.com/airbytehq/airbyte) is a data integration platform for ELT pipelines from APIs, databases & files to warehouses & lakes. It has the largest catalog of ELT connectors to data warehouses and databases.\n",
"\n",
"This loader exposes the Stripe connector as a document loader, allowing you to load various Stripe objects as documents."
]
},
{
"cell_type": "markdown",
"id": "6847a40c",
"metadata": {},
"source": []
},
{
"cell_type": "markdown",
"id": "3b06fbde",
"metadata": {},
"source": [
"## Installation"
]
},
{
"cell_type": "markdown",
"id": "e3e9dc79",
"metadata": {},
"source": [
"First, you need to install the `airbyte-source-stripe` python package."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4d35e4e0",
"metadata": {},
"outputs": [],
"source": [
"#!pip install airbyte-source-stripe"
]
},
{
"cell_type": "markdown",
"id": "ae855210",
"metadata": {},
"source": [
"## Example"
]
},
{
"cell_type": "markdown",
"id": "02208f52",
"metadata": {},
"source": [
"Check out the [Airbyte documentation page](https://docs.airbyte.com/integrations/sources/stripe/) for details about how to configure the reader.\n",
"The JSON schema the config object should adhere to can be found on Github: [https://github.com/airbytehq/airbyte/blob/master/airbyte-integrations/connectors/source-stripe/source_stripe/spec.yaml](https://github.com/airbytehq/airbyte/blob/master/airbyte-integrations/connectors/source-stripe/source_stripe/spec.yaml).\n",
"\n",
"The general shape looks like this:\n",
"```python\n",
"{\n",
"  \"client_secret\": \"<secret key>\",\n",
"  \"account_id\": \"<account id>\",\n",
"  \"start_date\": \"<date from which to start retrieving records from in ISO format, e.g. 2020-10-20T00:00:00Z>\",\n",
"}\n",
"```\n",
"\n",
"By default all fields are stored as metadata in the documents and the text is set to an empty string. Construct the text of the document by transforming the documents returned by the reader."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "89a99e58",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders.airbyte import AirbyteStripeLoader\n",
"\n",
"config = {\n",
"    # your stripe configuration\n",
"}\n",
"\n",
"loader = AirbyteStripeLoader(config=config, stream_name=\"invoices\") # check the documentation linked above for a list of all streams"
]
},
{
"cell_type": "markdown",
"id": "2cea23fc",
"metadata": {},
"source": [
"Now you can load documents the usual way."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dae75cdb",
"metadata": {},
"outputs": [],
"source": [
"docs = loader.load()"
]
},
{
"cell_type": "markdown",
"id": "4a93dc2a",
"metadata": {},
"source": [
"As `load` returns a list, it will block until all documents are loaded. To have better control over this process, you can also use the `lazy_load` method, which returns an iterator instead:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1782db09",
"metadata": {},
"outputs": [],
"source": [
"docs_iterator = loader.lazy_load()"
]
},
{
"cell_type": "markdown",
"id": "3a124086",
"metadata": {},
"source": [
"Keep in mind that by default the page content is empty and the metadata object contains all the information from the record. To create documents in a different way, pass in a `record_handler` function when creating the loader:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5671395d",
"metadata": {},
"outputs": [],
"source": [
"from langchain.docstore.document import Document\n",
"\n",
"def handle_record(record, id):\n",
"    return Document(page_content=record.data[\"title\"], metadata=record.data)\n",
"\n",
"loader = AirbyteStripeLoader(config=config, record_handler=handle_record, stream_name=\"invoices\")\n",
"docs = loader.load()"
]
},
{
"cell_type": "markdown",
"id": "223eb8bc",
"metadata": {},
"source": [
"## Incremental loads\n",
"\n",
"Some streams allow incremental loading: this means the source keeps track of synced records and won't load them again. This is useful for sources that have a high volume of data and are updated frequently.\n",
"\n",
"To take advantage of this, store the `last_state` property of the loader and pass it in when creating the loader again. This will ensure that only new records are loaded."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7061e735",
"metadata": {},
"outputs": [],
"source": [
"last_state = loader.last_state # store safely\n",
"\n",
"incremental_loader = AirbyteStripeLoader(config=config, record_handler=handle_record, stream_name=\"invoices\", state=last_state)\n",
"\n",
"new_docs = incremental_loader.load()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
@@ -1,209 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1f3a5ebf",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Airbyte Typeform"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "35ac77b1-449b-44f7-b8f3-3494d55c286e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
">[Airbyte](https://github.com/airbytehq/airbyte) is a data integration platform for ELT pipelines from APIs, databases & files to warehouses & lakes. It has the largest catalog of ELT connectors to data warehouses and databases.\n",
|
||||
"\n",
|
||||
"This loader exposes the Typeform connector as a document loader, allowing you to load various Typeform objects as documents."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6847a40c",
|
||||
"metadata": {},
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3b06fbde",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Installation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e3e9dc79",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"First, you need to install the `airbyte-source-typeform` python package."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4d35e4e0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#!pip install airbyte-source-typeform"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ae855210",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Example"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "02208f52",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Check out the [Airbyte documentation page](https://docs.airbyte.com/integrations/sources/typeform/) for details about how to configure the reader.\n",
|
||||
"The JSON schema the config object should adhere to can be found on Github: [https://github.com/airbytehq/airbyte/blob/master/airbyte-integrations/connectors/source-typeform/source_typeform/spec.json](https://github.com/airbytehq/airbyte/blob/master/airbyte-integrations/connectors/source-typeform/source_typeform/spec.json).\n",
|
||||
"\n",
|
||||
"The general shape looks like this:\n",
|
||||
"```python\n",
|
||||
"{\n",
|
||||
" \"credentials\": {\n",
|
||||
" \"auth_type\": \"Private Token\",\n",
|
||||
" \"access_token\": \"<your auth token>\"\n",
|
||||
" },\n",
|
||||
" \"start_date\": \"<date from which to start retrieving records from in ISO format, e.g. 2020-10-20T00:00:00Z>\",\n",
|
||||
" \"form_ids\": [\"<id of form to load records for>\"] # if omitted, records from all forms will be loaded\n",
|
||||
"}\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"By default all fields are stored as metadata in the documents and the text is set to an empty string. Construct the text of the document by transforming the documents returned by the reader."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "89a99e58",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"from langchain.document_loaders.airbyte import AirbyteTypeformLoader\n",
|
||||
"\n",
|
||||
"config = {\n",
|
||||
" # your typeform configuration\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"loader = AirbyteTypeformLoader(config=config, stream_name=\"forms\") # check the documentation linked above for a list of all streams"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2cea23fc",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now you can load documents the usual way"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "dae75cdb",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4a93dc2a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"As `load` returns a list, it will block until all documents are loaded. To have better control over this process, you can also you the `lazy_load` method which returns an iterator instead:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1782db09",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs_iterator = loader.lazy_load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3a124086",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Keep in mind that by default the page content is empty and the metadata object contains all the information from the record. To create documents in a different, pass in a record_handler function when creating the loader:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5671395d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.docstore.document import Document\n",
|
||||
"\n",
|
||||
"def handle_record(record, id):\n",
|
||||
" return Document(page_content=record.data[\"title\"], metadata=record.data)\n",
|
||||
"\n",
|
||||
"loader = AirbyteTypeformLoader(config=config, record_handler=handle_record, stream_name=\"forms\")\n",
|
||||
"docs = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "223eb8bc",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Incremental loads\n",
|
||||
"\n",
|
||||
"Some streams allow incremental loading, this means the source keeps track of synced records and won't load them again. This is useful for sources that have a high volume of data and are updated frequently.\n",
|
||||
"\n",
|
||||
"To take advantage of this, store the `last_state` property of the loader and pass it in when creating the loader again. This will ensure that only new records are loaded."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7061e735",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"last_state = loader.last_state # store safely\n",
|
||||
"\n",
|
||||
"incremental_loader = AirbyteTypeformLoader(config=config, record_handler=handle_record, stream_name=\"forms\", state=last_state)\n",
|
||||
"\n",
|
||||
"new_docs = incremental_loader.load()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
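Since `lazy_load` yields documents one at a time, you can process a large stream without keeping every document in memory. A minimal sketch, reusing the `loader` from the cells above (the per-document processing step is a placeholder):

```python
# Process each form as it arrives instead of materializing the full list.
for doc in loader.lazy_load():
    # Placeholder processing step: replace with indexing, embedding, etc.
    print(doc.page_content[:80])
```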
@@ -1,210 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1f3a5ebf",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Airbyte Zendesk Support"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "35ac77b1-449b-44f7-b8f3-3494d55c286e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
">[Airbyte](https://github.com/airbytehq/airbyte) is a data integration platform for ELT pipelines from APIs, databases & files to warehouses & lakes. It has the largest catalog of ELT connectors to data warehouses and databases.\n",
|
||||
"\n",
|
||||
"This loader exposes the Zendesk Support connector as a document loader, allowing you to load various objects as documents."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6847a40c",
|
||||
"metadata": {},
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3b06fbde",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Installation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e3e9dc79",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"First, you need to install the `airbyte-source-zendesk-support` python package."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4d35e4e0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#!pip install airbyte-source-zendesk-support"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ae855210",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Example"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "02208f52",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Check out the [Airbyte documentation page](https://docs.airbyte.com/integrations/sources/zendesk-support/) for details about how to configure the reader.\n",
|
||||
"The JSON schema the config object should adhere to can be found on Github: [https://github.com/airbytehq/airbyte/blob/master/airbyte-integrations/connectors/source-zendesk-support/source_zendesk_support/spec.json](https://github.com/airbytehq/airbyte/blob/master/airbyte-integrations/connectors/source-zendesk-support/source_zendesk_support/spec.json).\n",
|
||||
"\n",
|
||||
"The general shape looks like this:\n",
|
||||
"```python\n",
|
||||
"{\n",
|
||||
" \"subdomain\": \"<your zendesk subdomain>\",\n",
|
||||
" \"start_date\": \"<date from which to start retrieving records from in ISO format, e.g. 2020-10-20T00:00:00Z>\",\n",
|
||||
" \"credentials\": {\n",
|
||||
" \"credentials\": \"api_token\",\n",
|
||||
" \"email\": \"<your email>\",\n",
|
||||
" \"api_token\": \"<your api token>\"\n",
|
||||
" }\n",
|
||||
"}\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"By default all fields are stored as metadata in the documents and the text is set to an empty string. Construct the text of the document by transforming the documents returned by the reader."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "89a99e58",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"from langchain.document_loaders.airbyte import AirbyteZendeskSupportLoader\n",
|
||||
"\n",
|
||||
"config = {\n",
|
||||
" # your zendesk-support configuration\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"loader = AirbyteZendeskSupportLoader(config=config, stream_name=\"tickets\") # check the documentation linked above for a list of all streams"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2cea23fc",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now you can load documents the usual way"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "dae75cdb",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4a93dc2a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"As `load` returns a list, it will block until all documents are loaded. To have better control over this process, you can also you the `lazy_load` method which returns an iterator instead:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1782db09",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs_iterator = loader.lazy_load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3a124086",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Keep in mind that by default the page content is empty and the metadata object contains all the information from the record. To create documents in a different, pass in a record_handler function when creating the loader:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5671395d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.docstore.document import Document\n",
|
||||
"\n",
|
||||
"def handle_record(record, id):\n",
|
||||
" return Document(page_content=record.data[\"title\"], metadata=record.data)\n",
|
||||
"\n",
|
||||
"loader = AirbyteZendeskSupportLoader(config=config, record_handler=handle_record, stream_name=\"tickets\")\n",
|
||||
"docs = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "223eb8bc",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Incremental loads\n",
|
||||
"\n",
|
||||
"Some streams allow incremental loading, this means the source keeps track of synced records and won't load them again. This is useful for sources that have a high volume of data and are updated frequently.\n",
|
||||
"\n",
|
||||
"To take advantage of this, store the `last_state` property of the loader and pass it in when creating the loader again. This will ensure that only new records are loaded."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7061e735",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"last_state = loader.last_state # store safely\n",
|
||||
"\n",
|
||||
"incremental_loader = AirbyteZendeskSupportLoader(config=config, stream_name=\"tickets\", state=last_state)\n",
|
||||
"\n",
|
||||
"new_docs = incremental_loader.load()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
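Zendesk tickets have no single obvious text field, so one option is to serialize the whole record into the page content. A minimal sketch of such a `record_handler` (the choice of `json.dumps` is illustrative, and assumes `record.data` is JSON-serializable):

```python
import json

from langchain.docstore.document import Document

def handle_ticket(record, id):
    # Serialize the full record so the page content captures every field.
    return Document(page_content=json.dumps(record.data), metadata=record.data)

loader = AirbyteZendeskSupportLoader(
    config=config, record_handler=handle_ticket, stream_name="tickets"
)
docs = loader.load()
```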
@@ -1,309 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "62359e08-cf80-4210-a30c-f450000e65b9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# ArcGIS\n",
|
||||
"\n",
|
||||
"This notebook demonstrates the use of the `langchain.document_loaders.ArcGISLoader` class.\n",
|
||||
"\n",
|
||||
"You will need to install the ArcGIS API for Python `arcgis` and, optionally, `bs4.BeautifulSoup`.\n",
|
||||
"\n",
|
||||
"You can use an `arcgis.gis.GIS` object for authenticated data loading, or leave it blank to access public data."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "b782cab5-0584-4e2a-9073-009fb8dc93a3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import ArcGISLoader\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"url = \"https://maps1.vcgov.org/arcgis/rest/services/Beaches/MapServer/7\"\n",
|
||||
"\n",
|
||||
"loader = ArcGISLoader(url)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "aa3053cf-4127-43ea-bf56-e378b348091f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"CPU times: user 7.86 ms, sys: 0 ns, total: 7.86 ms\n",
|
||||
"Wall time: 802 ms\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%%time\n",
|
||||
"\n",
|
||||
"docs = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "a2444519-9117-4feb-8bb9-8931ce286fa5",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'accessed': '2023-08-15T04:30:41.689270+00:00Z',\n",
|
||||
" 'name': 'Beach Ramps',\n",
|
||||
" 'url': 'https://maps1.vcgov.org/arcgis/rest/services/Beaches/MapServer/7',\n",
|
||||
" 'layer_description': '(Not Provided)',\n",
|
||||
" 'item_description': '(Not Provided)',\n",
|
||||
" 'layer_properties': {\n",
|
||||
" \"currentVersion\": 10.81,\n",
|
||||
" \"id\": 7,\n",
|
||||
" \"name\": \"Beach Ramps\",\n",
|
||||
" \"type\": \"Feature Layer\",\n",
|
||||
" \"description\": \"\",\n",
|
||||
" \"geometryType\": \"esriGeometryPoint\",\n",
|
||||
" \"sourceSpatialReference\": {\n",
|
||||
" \"wkid\": 2881,\n",
|
||||
" \"latestWkid\": 2881\n",
|
||||
" },\n",
|
||||
" \"copyrightText\": \"\",\n",
|
||||
" \"parentLayer\": null,\n",
|
||||
" \"subLayers\": [],\n",
|
||||
" \"minScale\": 750000,\n",
|
||||
" \"maxScale\": 0,\n",
|
||||
" \"drawingInfo\": {\n",
|
||||
" \"renderer\": {\n",
|
||||
" \"type\": \"simple\",\n",
|
||||
" \"symbol\": {\n",
|
||||
" \"type\": \"esriPMS\",\n",
|
||||
" \"url\": \"9bb2e5ca499bb68aa3ee0d4e1ecc3849\",\n",
|
||||
" \"imageData\": \"iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IB2cksfwAAAAlwSFlzAAAOxAAADsQBlSsOGwAAAJJJREFUOI3NkDEKg0AQRZ9kkSnSGBshR7DJqdJYeg7BMpcS0uQWQsqoCLExkcUJzGqT38zw2fcY1rEzbp7vjXz0EXC7gBxs1ABcG/8CYkCcDqwyLqsV+RlV0I/w7PzuJBArr1VB20H58Ls6h+xoFITkTwWpQJX7XSIBAnFwVj7MLAjJV/AC6G3QoAmK+74Lom04THTBEp/HCSc6AAAAAElFTkSuQmCC\",\n",
|
||||
" \"contentType\": \"image/png\",\n",
|
||||
" \"width\": 12,\n",
|
||||
" \"height\": 12,\n",
|
||||
" \"angle\": 0,\n",
|
||||
" \"xoffset\": 0,\n",
|
||||
" \"yoffset\": 0\n",
|
||||
" },\n",
|
||||
" \"label\": \"\",\n",
|
||||
" \"description\": \"\"\n",
|
||||
" },\n",
|
||||
" \"transparency\": 0,\n",
|
||||
" \"labelingInfo\": null\n",
|
||||
" },\n",
|
||||
" \"defaultVisibility\": true,\n",
|
||||
" \"extent\": {\n",
|
||||
" \"xmin\": -81.09480168806815,\n",
|
||||
" \"ymin\": 28.858349245353473,\n",
|
||||
" \"xmax\": -80.77512908572814,\n",
|
||||
" \"ymax\": 29.41078388840041,\n",
|
||||
" \"spatialReference\": {\n",
|
||||
" \"wkid\": 4326,\n",
|
||||
" \"latestWkid\": 4326\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" \"hasAttachments\": false,\n",
|
||||
" \"htmlPopupType\": \"esriServerHTMLPopupTypeNone\",\n",
|
||||
" \"displayField\": \"AccessName\",\n",
|
||||
" \"typeIdField\": null,\n",
|
||||
" \"subtypeFieldName\": null,\n",
|
||||
" \"subtypeField\": null,\n",
|
||||
" \"defaultSubtypeCode\": null,\n",
|
||||
" \"fields\": [\n",
|
||||
" {\n",
|
||||
" \"name\": \"OBJECTID\",\n",
|
||||
" \"type\": \"esriFieldTypeOID\",\n",
|
||||
" \"alias\": \"OBJECTID\",\n",
|
||||
" \"domain\": null\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"name\": \"Shape\",\n",
|
||||
" \"type\": \"esriFieldTypeGeometry\",\n",
|
||||
" \"alias\": \"Shape\",\n",
|
||||
" \"domain\": null\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"name\": \"AccessName\",\n",
|
||||
" \"type\": \"esriFieldTypeString\",\n",
|
||||
" \"alias\": \"AccessName\",\n",
|
||||
" \"length\": 40,\n",
|
||||
" \"domain\": null\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"name\": \"AccessID\",\n",
|
||||
" \"type\": \"esriFieldTypeString\",\n",
|
||||
" \"alias\": \"AccessID\",\n",
|
||||
" \"length\": 50,\n",
|
||||
" \"domain\": null\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"name\": \"AccessType\",\n",
|
||||
" \"type\": \"esriFieldTypeString\",\n",
|
||||
" \"alias\": \"AccessType\",\n",
|
||||
" \"length\": 25,\n",
|
||||
" \"domain\": null\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"name\": \"GeneralLoc\",\n",
|
||||
" \"type\": \"esriFieldTypeString\",\n",
|
||||
" \"alias\": \"GeneralLoc\",\n",
|
||||
" \"length\": 100,\n",
|
||||
" \"domain\": null\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"name\": \"MilePost\",\n",
|
||||
" \"type\": \"esriFieldTypeDouble\",\n",
|
||||
" \"alias\": \"MilePost\",\n",
|
||||
" \"domain\": null\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"name\": \"City\",\n",
|
||||
" \"type\": \"esriFieldTypeString\",\n",
|
||||
" \"alias\": \"City\",\n",
|
||||
" \"length\": 50,\n",
|
||||
" \"domain\": null\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"name\": \"AccessStatus\",\n",
|
||||
" \"type\": \"esriFieldTypeString\",\n",
|
||||
" \"alias\": \"AccessStatus\",\n",
|
||||
" \"length\": 50,\n",
|
||||
" \"domain\": null\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"name\": \"Entry_Date_Time\",\n",
|
||||
" \"type\": \"esriFieldTypeDate\",\n",
|
||||
" \"alias\": \"Entry_Date_Time\",\n",
|
||||
" \"length\": 8,\n",
|
||||
" \"domain\": null\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"name\": \"DrivingZone\",\n",
|
||||
" \"type\": \"esriFieldTypeString\",\n",
|
||||
" \"alias\": \"DrivingZone\",\n",
|
||||
" \"length\": 50,\n",
|
||||
" \"domain\": null\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" \"geometryField\": {\n",
|
||||
" \"name\": \"Shape\",\n",
|
||||
" \"type\": \"esriFieldTypeGeometry\",\n",
|
||||
" \"alias\": \"Shape\"\n",
|
||||
" },\n",
|
||||
" \"indexes\": null,\n",
|
||||
" \"subtypes\": [],\n",
|
||||
" \"relationships\": [],\n",
|
||||
" \"canModifyLayer\": true,\n",
|
||||
" \"canScaleSymbols\": false,\n",
|
||||
" \"hasLabels\": false,\n",
|
||||
" \"capabilities\": \"Map,Query,Data\",\n",
|
||||
" \"maxRecordCount\": 1000,\n",
|
||||
" \"supportsStatistics\": true,\n",
|
||||
" \"supportsAdvancedQueries\": true,\n",
|
||||
" \"supportedQueryFormats\": \"JSON, geoJSON\",\n",
|
||||
" \"isDataVersioned\": false,\n",
|
||||
" \"ownershipBasedAccessControlForFeatures\": {\n",
|
||||
" \"allowOthersToQuery\": true\n",
|
||||
" },\n",
|
||||
" \"useStandardizedQueries\": true,\n",
|
||||
" \"advancedQueryCapabilities\": {\n",
|
||||
" \"useStandardizedQueries\": true,\n",
|
||||
" \"supportsStatistics\": true,\n",
|
||||
" \"supportsHavingClause\": true,\n",
|
||||
" \"supportsCountDistinct\": true,\n",
|
||||
" \"supportsOrderBy\": true,\n",
|
||||
" \"supportsDistinct\": true,\n",
|
||||
" \"supportsPagination\": true,\n",
|
||||
" \"supportsTrueCurve\": true,\n",
|
||||
" \"supportsReturningQueryExtent\": true,\n",
|
||||
" \"supportsQueryWithDistance\": true,\n",
|
||||
" \"supportsSqlExpression\": true\n",
|
||||
" },\n",
|
||||
" \"supportsDatumTransformation\": true,\n",
|
||||
" \"dateFieldsTimeReference\": null,\n",
|
||||
" \"supportsCoordinatesQuantization\": true\n",
|
||||
" }}"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs[0].metadata"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "1d132b7d-5a13-4d66-98e8-785ffdf87af0",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{\"OBJECTID\": 4, \"AccessName\": \"BEACHWAY AV\", \"AccessID\": \"NS-106\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"1400 N ATLANTIC AV\", \"MilePost\": 1.57, \"City\": \"NEW SMYRNA BEACH\", \"AccessStatus\": \"CLOSED\", \"Entry_Date_Time\": 1692039947000, \"DrivingZone\": \"YES\"}\n",
|
||||
"{\"OBJECTID\": 5, \"AccessName\": \"SEABREEZE BLVD\", \"AccessID\": \"DB-051\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"500 BLK N ATLANTIC AV\", \"MilePost\": 14.24, \"City\": \"DAYTONA BEACH\", \"AccessStatus\": \"CLOSED\", \"Entry_Date_Time\": 1692039947000, \"DrivingZone\": \"BOTH\"}\n",
|
||||
"{\"OBJECTID\": 6, \"AccessName\": \"27TH AV\", \"AccessID\": \"NS-141\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"3600 BLK S ATLANTIC AV\", \"MilePost\": 4.83, \"City\": \"NEW SMYRNA BEACH\", \"AccessStatus\": \"CLOSED\", \"Entry_Date_Time\": 1692039947000, \"DrivingZone\": \"BOTH\"}\n",
|
||||
"{\"OBJECTID\": 11, \"AccessName\": \"INTERNATIONAL SPEEDWAY BLVD\", \"AccessID\": \"DB-059\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"300 BLK S ATLANTIC AV\", \"MilePost\": 15.27, \"City\": \"DAYTONA BEACH\", \"AccessStatus\": \"CLOSED\", \"Entry_Date_Time\": 1692039947000, \"DrivingZone\": \"BOTH\"}\n",
|
||||
"{\"OBJECTID\": 14, \"AccessName\": \"GRANADA BLVD\", \"AccessID\": \"OB-030\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"20 BLK OCEAN SHORE BLVD\", \"MilePost\": 10.02, \"City\": \"ORMOND BEACH\", \"AccessStatus\": \"CLOSED\", \"Entry_Date_Time\": 1692039947000, \"DrivingZone\": \"BOTH\"}\n",
|
||||
"{\"OBJECTID\": 27, \"AccessName\": \"UNIVERSITY BLVD\", \"AccessID\": \"DB-048\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"900 BLK N ATLANTIC AV\", \"MilePost\": 13.74, \"City\": \"DAYTONA BEACH\", \"AccessStatus\": \"CLOSED\", \"Entry_Date_Time\": 1692039947000, \"DrivingZone\": \"BOTH\"}\n",
|
||||
"{\"OBJECTID\": 38, \"AccessName\": \"BEACH ST\", \"AccessID\": \"PI-097\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"4890 BLK S ATLANTIC AV\", \"MilePost\": 25.85, \"City\": \"PONCE INLET\", \"AccessStatus\": \"CLOSED\", \"Entry_Date_Time\": 1692039947000, \"DrivingZone\": \"BOTH\"}\n",
|
||||
"{\"OBJECTID\": 42, \"AccessName\": \"BOTEFUHR AV\", \"AccessID\": \"DBS-067\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"1900 BLK S ATLANTIC AV\", \"MilePost\": 16.68, \"City\": \"DAYTONA BEACH SHORES\", \"AccessStatus\": \"CLOSED\", \"Entry_Date_Time\": 1692039947000, \"DrivingZone\": \"YES\"}\n",
|
||||
"{\"OBJECTID\": 43, \"AccessName\": \"SILVER BEACH AV\", \"AccessID\": \"DB-064\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"1000 BLK S ATLANTIC AV\", \"MilePost\": 15.98, \"City\": \"DAYTONA BEACH\", \"AccessStatus\": \"CLOSED\", \"Entry_Date_Time\": 1692039947000, \"DrivingZone\": \"YES\"}\n",
|
||||
"{\"OBJECTID\": 45, \"AccessName\": \"MILSAP RD\", \"AccessID\": \"OB-037\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"700 BLK S ATLANTIC AV\", \"MilePost\": 11.52, \"City\": \"ORMOND BEACH\", \"AccessStatus\": \"CLOSED\", \"Entry_Date_Time\": 1692039947000, \"DrivingZone\": \"YES\"}\n",
|
||||
"{\"OBJECTID\": 56, \"AccessName\": \"3RD AV\", \"AccessID\": \"NS-118\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"1200 BLK HILL ST\", \"MilePost\": 3.25, \"City\": \"NEW SMYRNA BEACH\", \"AccessStatus\": \"CLOSED\", \"Entry_Date_Time\": 1692039947000, \"DrivingZone\": \"YES\"}\n",
|
||||
"{\"OBJECTID\": 64, \"AccessName\": \"DUNLAWTON BLVD\", \"AccessID\": \"DBS-078\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"3400 BLK S ATLANTIC AV\", \"MilePost\": 20.61, \"City\": \"DAYTONA BEACH SHORES\", \"AccessStatus\": \"CLOSED\", \"Entry_Date_Time\": 1692039947000, \"DrivingZone\": \"YES\"}\n",
|
||||
"{\"OBJECTID\": 69, \"AccessName\": \"EMILIA AV\", \"AccessID\": \"DBS-082\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"3790 BLK S ATLANTIC AV\", \"MilePost\": 21.38, \"City\": \"DAYTONA BEACH SHORES\", \"AccessStatus\": \"CLOSED\", \"Entry_Date_Time\": 1692039947000, \"DrivingZone\": \"BOTH\"}\n",
|
||||
"{\"OBJECTID\": 94, \"AccessName\": \"FLAGLER AV\", \"AccessID\": \"NS-110\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"500 BLK FLAGLER AV\", \"MilePost\": 2.57, \"City\": \"NEW SMYRNA BEACH\", \"AccessStatus\": \"CLOSED\", \"Entry_Date_Time\": 1692039947000, \"DrivingZone\": \"YES\"}\n",
|
||||
"{\"OBJECTID\": 96, \"AccessName\": \"CRAWFORD RD\", \"AccessID\": \"NS-108\", \"AccessType\": \"OPEN VEHICLE RAMP - PASS\", \"GeneralLoc\": \"800 BLK N ATLANTIC AV\", \"MilePost\": 2.19, \"City\": \"NEW SMYRNA BEACH\", \"AccessStatus\": \"CLOSED\", \"Entry_Date_Time\": 1692039947000, \"DrivingZone\": \"YES\"}\n",
|
||||
"{\"OBJECTID\": 124, \"AccessName\": \"HARTFORD AV\", \"AccessID\": \"DB-043\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"1890 BLK N ATLANTIC AV\", \"MilePost\": 12.76, \"City\": \"DAYTONA BEACH\", \"AccessStatus\": \"CLOSED\", \"Entry_Date_Time\": 1692039947000, \"DrivingZone\": \"YES\"}\n",
|
||||
"{\"OBJECTID\": 127, \"AccessName\": \"WILLIAMS AV\", \"AccessID\": \"DB-042\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"2200 BLK N ATLANTIC AV\", \"MilePost\": 12.5, \"City\": \"DAYTONA BEACH\", \"AccessStatus\": \"CLOSED\", \"Entry_Date_Time\": 1692039947000, \"DrivingZone\": \"YES\"}\n",
|
||||
"{\"OBJECTID\": 136, \"AccessName\": \"CARDINAL DR\", \"AccessID\": \"OB-036\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"600 BLK S ATLANTIC AV\", \"MilePost\": 11.27, \"City\": \"ORMOND BEACH\", \"AccessStatus\": \"CLOSED\", \"Entry_Date_Time\": 1692039947000, \"DrivingZone\": \"YES\"}\n",
|
||||
"{\"OBJECTID\": 229, \"AccessName\": \"EL PORTAL ST\", \"AccessID\": \"DBS-076\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"3200 BLK S ATLANTIC AV\", \"MilePost\": 20.04, \"City\": \"DAYTONA BEACH SHORES\", \"AccessStatus\": \"CLOSED\", \"Entry_Date_Time\": 1692039947000, \"DrivingZone\": \"YES\"}\n",
|
||||
"{\"OBJECTID\": 230, \"AccessName\": \"HARVARD DR\", \"AccessID\": \"OB-038\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"900 BLK S ATLANTIC AV\", \"MilePost\": 11.72, \"City\": \"ORMOND BEACH\", \"AccessStatus\": \"CLOSED\", \"Entry_Date_Time\": 1692039947000, \"DrivingZone\": \"YES\"}\n",
|
||||
"{\"OBJECTID\": 232, \"AccessName\": \"VAN AV\", \"AccessID\": \"DBS-075\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"3100 BLK S ATLANTIC AV\", \"MilePost\": 19.6, \"City\": \"DAYTONA BEACH SHORES\", \"AccessStatus\": \"CLOSED\", \"Entry_Date_Time\": 1692039947000, \"DrivingZone\": \"YES\"}\n",
|
||||
"{\"OBJECTID\": 234, \"AccessName\": \"ROCKEFELLER DR\", \"AccessID\": \"OB-034\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"400 BLK S ATLANTIC AV\", \"MilePost\": 10.9, \"City\": \"ORMOND BEACH\", \"AccessStatus\": \"CLOSED\", \"Entry_Date_Time\": 1692039947000, \"DrivingZone\": \"YES\"}\n",
|
||||
"{\"OBJECTID\": 235, \"AccessName\": \"MINERVA RD\", \"AccessID\": \"DBS-069\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"2300 BLK S ATLANTIC AV\", \"MilePost\": 17.52, \"City\": \"DAYTONA BEACH SHORES\", \"AccessStatus\": \"CLOSED\", \"Entry_Date_Time\": 1692039947000, \"DrivingZone\": \"YES\"}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for doc in docs:\n",
|
||||
" print(doc.page_content)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
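The ArcGIS notebook above uses an anonymous connection to a public layer. A minimal sketch of authenticated loading, assuming the loader accepts a `gis` argument as the intro suggests (the portal URL, credentials, and service URL are placeholders):

```python
from arcgis.gis import GIS
from langchain.document_loaders import ArcGISLoader

# Authenticate against an ArcGIS portal (placeholder credentials).
gis = GIS("https://www.arcgis.com", username="your-username", password="your-password")

loader = ArcGISLoader(
    "https://your-server/arcgis/rest/services/YourService/MapServer/0",
    gis=gis,  # assumed argument, per "use an arcgis.gis.GIS object for authenticated data loading"
)
docs = loader.load()
```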
@@ -1,101 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ad553e51",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Async Chromium\n",
|
||||
"\n",
|
||||
"Chromium is one of the browsers supported by Playwright, a library used to control browser automation. \n",
|
||||
"\n",
|
||||
"By running `p.chromium.launch(headless=True)`, we are launching a headless instance of Chromium. \n",
|
||||
"\n",
|
||||
"Headless mode means that the browser is running without a graphical user interface.\n",
|
||||
"\n",
|
||||
"`AsyncChromiumLoader` load the page, and then we use `Html2TextTransformer` to trasnform to text."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1c3a4c19",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"! pip install -q playwright beautifulsoup4\n",
|
||||
"! playwright install"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "dd2cdea7",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'<!DOCTYPE html><html lang=\"en\"><head><script src=\"https://s0.2mdn.net/instream/video/client.js\" asyn'"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.document_loaders import AsyncChromiumLoader\n",
|
||||
"urls = [\"https://www.wsj.com\"]\n",
|
||||
"loader = AsyncChromiumLoader(urls)\n",
|
||||
"docs = loader.load()\n",
|
||||
"docs[0].page_content[0:100]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "013caa7e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"\"Skip to Main ContentSkip to SearchSkip to... Select * Top News * What's News *\\nFeatured Stories * Retirement * Life & Arts * Hip-Hop * Sports * Video *\\nEconomy * Real Estate * Sports * CMO * CIO * CFO * Risk & Compliance *\\nLogistics Report * Sustainable Business * Heard on the Street * Barron’s *\\nMarketWatch * Mansion Global * Penta * Opinion * Journal Reports * Sponsored\\nOffers Explore Our Brands * WSJ * * * * * Barron's * * * * * MarketWatch * * *\\n* * IBD # The Wall Street Journal SubscribeSig\""
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.document_transformers import Html2TextTransformer\n",
|
||||
"html2text = Html2TextTransformer()\n",
|
||||
"docs_transformed = html2text.transform_documents(docs)\n",
|
||||
"docs_transformed[0].page_content[0:500]"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.16"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -1,94 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "23c6e167",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Concurrent Loader\n",
|
||||
"\n",
|
||||
"Works just like the GenericLoader but concurrently for those who choose to optimize their workflow.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "6ff3fb1f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import ConcurrentLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "ce96fa20",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = ConcurrentLoader.from_filesystem('example_data/', glob=\"**/*.txt\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "06a6cf5d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"files = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "b87d3e58",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"2"
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"len(files)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "668f1ee5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -1,13 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
|
||||
<opml version="1.0">
|
||||
<head>
|
||||
<title>Sample RSS feed subscriptions</title>
|
||||
</head>
|
||||
<body>
|
||||
<outline text="Tech" title="Tech">
|
||||
<outline type="rss" text="Engadget" title="Engadget" xmlUrl="http://www.engadget.com/rss-full.xml" htmlUrl="http://www.engadget.com"/>
|
||||
<outline type="rss" text="Ars Technica - All content" title="Ars Technica - All content" xmlUrl="http://feeds.arstechnica.com/arstechnica/index/" htmlUrl="https://arstechnica.com"/>
|
||||
</outline>
|
||||
</body>
|
||||
</opml>
|
||||
@@ -73,27 +73,13 @@
|
||||
"loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "41c8a46f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"If you want to use an alternative loader, you can provide a custom function, for example:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "eba3002d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import PyPDFLoader\n",
|
||||
"def load_pdf(file_path):\n",
|
||||
" return PyPDFLoader(file_path)\n",
|
||||
"\n",
|
||||
"loader = GCSFileLoader(project_name=\"aist\", bucket=\"testing-hwc\", blob=\"fake.pdf\", loader_func=load_pdf)"
|
||||
]
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
||||
@@ -9,16 +9,66 @@
|
||||
"\n",
|
||||
"GROBID is a machine learning library for extracting, parsing, and re-structuring raw documents.\n",
|
||||
"\n",
|
||||
"It is designed and expected to be used to parse academic papers, where it works particularly well. Note: if the articles supplied to Grobid are large documents (e.g. dissertations) exceeding a certain number of elements, they might not be processed. \n",
|
||||
"It is particularly good for sturctured PDFs, like academic papers.\n",
|
||||
"\n",
|
||||
"This loader uses Grobid to parse PDFs into `Documents` that retain metadata associated with the section of text.\n",
|
||||
"This loader uses GROBIB to parse PDFs into `Documents` that retain metadata associated with the section of text.\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"The best approach is to install Grobid via docker, see https://grobid.readthedocs.io/en/latest/Grobid-docker/. \n",
|
||||
"\n",
|
||||
"(Note: additional instructions can be found [here](https://python.langchain.com/docs/extras/integrations/providers/grobid.mdx).)\n",
|
||||
"For users on `Mac` - \n",
|
||||
"\n",
|
||||
"Once grobid is up-and-running you can interact as described below. \n"
|
||||
"(Note: additional instructions can be found [here](https://python.langchain.com/docs/ecosystem/integrations/grobid.mdx).)\n",
|
||||
"\n",
|
||||
"Install Java (Apple Silicon):\n",
|
||||
"```\n",
|
||||
"$ arch -arm64 brew install openjdk@11\n",
|
||||
"$ brew --prefix openjdk@11\n",
|
||||
"/opt/homebrew/opt/openjdk@ 11\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"In `~/.zshrc`:\n",
|
||||
"```\n",
|
||||
"export JAVA_HOME=/opt/homebrew/opt/openjdk@11\n",
|
||||
"export PATH=$JAVA_HOME/bin:$PATH\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"Then, in Terminal:\n",
|
||||
"```\n",
|
||||
"$ source ~/.zshrc\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"Confirm install:\n",
|
||||
"```\n",
|
||||
"$ which java\n",
|
||||
"/opt/homebrew/opt/openjdk@11/bin/java\n",
|
||||
"$ java -version \n",
|
||||
"openjdk version \"11.0.19\" 2023-04-18\n",
|
||||
"OpenJDK Runtime Environment Homebrew (build 11.0.19+0)\n",
|
||||
"OpenJDK 64-Bit Server VM Homebrew (build 11.0.19+0, mixed mode)\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"Then, get [Grobid](https://grobid.readthedocs.io/en/latest/Install-Grobid/#getting-grobid):\n",
|
||||
"```\n",
|
||||
"$ curl -LO https://github.com/kermitt2/grobid/archive/0.7.3.zip\n",
|
||||
"$ unzip 0.7.3.zip\n",
|
||||
"```\n",
|
||||
" \n",
|
||||
"Build\n",
|
||||
"```\n",
|
||||
"$ ./gradlew clean install\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"Then, run the server:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "2d8992fc",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"! get_ipython().system_raw('nohup ./gradlew run > grobid.log 2>&1 &')"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
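With the Grobid server running, the typical pattern is to point a `GenericLoader` at a directory of PDFs and let `GrobidParser` do the section-aware parsing. A minimal sketch, assuming the `GrobidParser` import path shown and a local `/path/to/pdfs` directory (adjust both to your setup):

```python
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import GrobidParser

# Parse every PDF in the directory via the local Grobid server.
loader = GenericLoader.from_filesystem(
    "/path/to/pdfs",
    glob="*",
    suffixes=[".pdf"],
    parser=GrobidParser(segment_sentences=False),
)
docs = loader.load()

# Each Document keeps section-level metadata alongside its text.
print(docs[0].metadata)
```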
@@ -1,178 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c83b6a4c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Huawei OBS Directory\n",
|
||||
"The following code demonstrates how to load objects from the Huawei OBS (Object Storage Service) as documents."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c2191935",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Install the required package\n",
|
||||
"# pip install esdk-obs-python"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "55fca3b4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import OBSDirectoryLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "c3ed419f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"endpoint = \"your-endpoint\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "3428fd4e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Configure your access credentials\\n\n",
|
||||
"config = {\n",
|
||||
" \"ak\": \"your-access-key\",\n",
|
||||
" \"sk\": \"your-secret-key\"\n",
|
||||
"}\n",
|
||||
"loader = OBSDirectoryLoader(\"your-bucket-name\", endpoint=endpoint, config=config)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9beede9f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1e20a839",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Specify a Prefix for Loading\n",
|
||||
"If you want to load objects with a specific prefix from the bucket, you can use the following code:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "125f311d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = OBSDirectoryLoader(\"your-bucket-name\", endpoint=endpoint, config=config, prefix=\"test_prefix\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b3488037",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "84c82c0a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Get Authentication Information from ECS\n",
|
||||
"If your langchain is deployed on Huawei Cloud ECS and [Agency is set up](https://support.huaweicloud.com/intl/en-us/usermanual-ecs/ecs_03_0166.html#section7), the loader can directly get the security token from ECS without needing access key and secret key. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "1db99969",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"config = {\"get_token_from_ecs\": True}\n",
|
||||
"loader = OBSDirectoryLoader(\"your-bucket-name\", endpoint=endpoint, config=config)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "57dd9f35",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "30205d25",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Use a Public Bucket\n",
|
||||
"If your bucket's bucket policy allows anonymous access (anonymous users have `listBucket` and `GetObject` permissions), you can directly load the objects without configuring the `config` parameter."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "4dfa2ef0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = OBSDirectoryLoader(\"your-bucket-name\", endpoint=endpoint)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "67d4c1d0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader.load()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -1,180 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4394a872",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Huawei OBS File\n",
|
||||
"The following code demonstrates how to load an object from the Huawei OBS (Object Storage Service) as document."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c43d811b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Install the required package\n",
|
||||
"# pip install esdk-obs-python"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "5e16bae6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders.obs_file import OBSFileLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "75cc7e7c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"endpoint = \"your-endpoint\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "f9816984",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from obs import ObsClient\n",
|
||||
"obs_client = ObsClient(access_key_id=\"your-access-key\", secret_access_key=\"your-secret-key\", server=endpoint)\n",
|
||||
"loader = OBSFileLoader(\"your-bucket-name\", \"your-object-key\", client=obs_client)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6143b39b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "633e05ca",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Each Loader with Separate Authentication Information\n",
|
||||
"If you don't need to reuse OBS connections between different loaders, you can directly configure the `config`. The loader will use the config information to initialize its own OBS client."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "a5dd6a5d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Configure your access credentials\\n\n",
|
||||
"config = {\n",
|
||||
" \"ak\": \"your-access-key\",\n",
|
||||
" \"sk\": \"your-secret-key\"\n",
|
||||
"}\n",
|
||||
"loader = OBSFileLoader(\"your-bucket-name\", \"your-object-key\",endpoint=endpoint, config=config)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9a741f1c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1e2e611c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Get Authentication Information from ECS\n",
|
||||
"If your langchain is deployed on Huawei Cloud ECS and [Agency is set up](https://support.huaweicloud.com/intl/en-us/usermanual-ecs/ecs_03_0166.html#section7), the loader can directly get the security token from ECS without needing access key and secret key. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "338fafef",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"config = {\"get_token_from_ecs\": True}\n",
|
||||
"loader = OBSFileLoader(\"your-bucket-name\", \"your-object-key\", endpoint=endpoint, config=config)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "73976c55",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b77aa18c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Access a Publicly Accessible Object\n",
|
||||
"If the object you want to access allows anonymous user access (anonymous users have `GetObject` permission), you can directly load the object without configuring the `config` parameter."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "df83d121",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = OBSFileLoader(\"your-bucket-name\", \"your-object-key\", endpoint=endpoint)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "82a844ba",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader.load()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
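Conversely, when you do want to reuse one OBS connection across several loaders, you can construct a single `ObsClient` and hand it to each `OBSFileLoader`. A minimal sketch (the bucket name, object keys, and credentials are placeholders):

```python
from obs import ObsClient
from langchain.document_loaders.obs_file import OBSFileLoader

# One shared client for all loaders (placeholder credentials).
obs_client = ObsClient(
    access_key_id="your-access-key",
    secret_access_key="your-secret-key",
    server=endpoint,
)

object_keys = ["report-1.txt", "report-2.txt"]  # placeholder object keys
docs = []
for key in object_keys:
    loader = OBSFileLoader("your-bucket-name", key, client=obs_client)
    docs.extend(loader.load())
```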
@@ -1,192 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2dfc4698",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# News URL\n",
|
||||
"\n",
|
||||
"This covers how to load HTML news articles from a list of URLs into a document format that we can use downstream."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "16c3699e",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2023-08-02T21:18:18.886031400Z",
|
||||
"start_time": "2023-08-02T21:18:17.682345Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import NewsURLLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "836fbac1",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2023-08-02T21:18:18.895539800Z",
|
||||
"start_time": "2023-08-02T21:18:18.895539800Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"urls = [\n",
|
||||
" \"https://www.bbc.com/news/world-us-canada-66388172\",\n",
|
||||
" \"https://www.bbc.com/news/entertainment-arts-66384971\",\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "33089aba-ff74-4d00-8f40-9449c29587cc",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Pass in urls to load them into Documents"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "00f46fda",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2023-08-02T21:18:19.227074500Z",
|
||||
"start_time": "2023-08-02T21:18:18.895539800Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"First article: page_content='In testimony to the congressional committee examining the 6 January riot, Mrs Powell said she did not review all of the many claims of election fraud she made, telling them that \"no reasonable person\" would view her claims as fact. Neither she nor her representatives have commented.' metadata={'title': 'Donald Trump indictment: What do we know about the six co-conspirators?', 'link': 'https://www.bbc.com/news/world-us-canada-66388172', 'authors': [], 'language': 'en', 'description': 'Six people accused of helping Mr Trump undermine the election have been described by prosecutors.', 'publish_date': None}\n",
|
||||
"\n",
|
||||
"Second article: page_content='Ms Williams added: \"If there\\'s anything that I can do in my power to ensure that dancers or singers or whoever decides to work with her don\\'t have to go through that same experience, I\\'m going to do that.\"' metadata={'title': \"Lizzo dancers Arianna Davis and Crystal Williams: 'No one speaks out, they are scared'\", 'link': 'https://www.bbc.com/news/entertainment-arts-66384971', 'authors': [], 'language': 'en', 'description': 'The US pop star is being sued for sexual harassment and fat-shaming but has yet to comment.', 'publish_date': None}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"loader = NewsURLLoader(urls=urls)\n",
|
||||
"data = loader.load()\n",
|
||||
"print(\"First article: \", data[0])\n",
|
||||
"print(\"\\nSecond article: \", data[1])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Use nlp=True to run nlp analysis and generate keywords + summary"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"id": "98ac26c488315bff"
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "b68a26b3",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2023-08-02T21:18:19.585758200Z",
|
||||
"start_time": "2023-08-02T21:18:19.227074500Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"First article: page_content='In testimony to the congressional committee examining the 6 January riot, Mrs Powell said she did not review all of the many claims of election fraud she made, telling them that \"no reasonable person\" would view her claims as fact. Neither she nor her representatives have commented.' metadata={'title': 'Donald Trump indictment: What do we know about the six co-conspirators?', 'link': 'https://www.bbc.com/news/world-us-canada-66388172', 'authors': [], 'language': 'en', 'description': 'Six people accused of helping Mr Trump undermine the election have been described by prosecutors.', 'publish_date': None, 'keywords': ['powell', 'know', 'donald', 'trump', 'review', 'indictment', 'telling', 'view', 'reasonable', 'person', 'testimony', 'coconspirators', 'riot', 'representatives', 'claims'], 'summary': 'In testimony to the congressional committee examining the 6 January riot, Mrs Powell said she did not review all of the many claims of election fraud she made, telling them that \"no reasonable person\" would view her claims as fact.\\nNeither she nor her representatives have commented.'}\n",
|
||||
"\n",
|
||||
"Second article: page_content='Ms Williams added: \"If there\\'s anything that I can do in my power to ensure that dancers or singers or whoever decides to work with her don\\'t have to go through that same experience, I\\'m going to do that.\"' metadata={'title': \"Lizzo dancers Arianna Davis and Crystal Williams: 'No one speaks out, they are scared'\", 'link': 'https://www.bbc.com/news/entertainment-arts-66384971', 'authors': [], 'language': 'en', 'description': 'The US pop star is being sued for sexual harassment and fat-shaming but has yet to comment.', 'publish_date': None, 'keywords': ['davis', 'lizzo', 'singers', 'experience', 'crystal', 'ensure', 'arianna', 'theres', 'williams', 'power', 'going', 'dancers', 'im', 'speaks', 'work', 'ms', 'scared'], 'summary': 'Ms Williams added: \"If there\\'s anything that I can do in my power to ensure that dancers or singers or whoever decides to work with her don\\'t have to go through that same experience, I\\'m going to do that.\"'}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"loader = NewsURLLoader(urls=urls, nlp=True)\n",
|
||||
"data = loader.load()\n",
|
||||
"print(\"First article: \", data[0])\n",
|
||||
"print(\"\\nSecond article: \", data[1])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": "['powell',\n 'know',\n 'donald',\n 'trump',\n 'review',\n 'indictment',\n 'telling',\n 'view',\n 'reasonable',\n 'person',\n 'testimony',\n 'coconspirators',\n 'riot',\n 'representatives',\n 'claims']"
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data[0].metadata['keywords']"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"ExecuteTime": {
|
||||
"end_time": "2023-08-02T21:18:19.585758200Z",
|
||||
"start_time": "2023-08-02T21:18:19.585758200Z"
|
||||
}
|
||||
},
|
||||
"id": "ae37e004e0284b1d"
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": "'In testimony to the congressional committee examining the 6 January riot, Mrs Powell said she did not review all of the many claims of election fraud she made, telling them that \"no reasonable person\" would view her claims as fact.\\nNeither she nor her representatives have commented.'"
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data[0].metadata['summary']"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"ExecuteTime": {
|
||||
"end_time": "2023-08-02T21:18:19.598966800Z",
|
||||
"start_time": "2023-08-02T21:18:19.594950200Z"
|
||||
}
|
||||
},
|
||||
"id": "7676155fb175e53e"
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
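The `nlp=True` runs above show that each document's metadata carries `keywords` and `summary` fields. As a minimal follow-on sketch (the `topic` variable and the filtering step are illustrative, not part of the loader), you could use those keywords to select articles:

```python
# Hypothetical post-processing of the `data` list produced above by
# NewsURLLoader(urls=urls, nlp=True): keep articles whose NLP-extracted
# keywords mention a topic of interest.
topic = "trump"  # illustrative choice
matching = [doc for doc in data if topic in doc.metadata.get("keywords", [])]
for doc in matching:
    print(doc.metadata["title"])
    print(doc.metadata["summary"])
```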
@@ -1,144 +0,0 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Nuclia Understanding API document loader\n",
"\n",
"[Nuclia](https://nuclia.com) automatically indexes your unstructured data from any internal and external source, providing optimized search results and generative answers. It can handle video and audio transcription, image content extraction, and document parsing.\n",
"\n",
"The Nuclia Understanding API supports the processing of unstructured data, including text, web pages, documents, and audio/video contents. It extracts all texts wherever they are (using speech-to-text or OCR when needed), it also extracts metadata, embedded files (like images in a PDF), and web links. If machine learning is enabled, it identifies entities, provides a summary of the content and generates embeddings for all the sentences.\n",
|
||||
"\n",
|
||||
"To use the Nuclia Understanding API, you need to have a Nuclia account. You can create one for free at [https://nuclia.cloud](https://nuclia.cloud), and then [create a NUA key](https://docs.nuclia.dev/docs/docs/using/understanding/intro)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#!pip install --upgrade protobuf\n",
|
||||
"#!pip install nucliadb-protos"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"\n",
|
||||
"os.environ[\"NUCLIA_ZONE\"] = \"<YOUR_ZONE>\" # e.g. europe-1\n",
|
||||
"os.environ[\"NUCLIA_NUA_KEY\"] = \"<YOUR_API_KEY>\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"To use the Nuclia document loader, you need to instantiate a `NucliaUnderstandingAPI` tool:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.tools.nuclia import NucliaUnderstandingAPI\n",
|
||||
"\n",
|
||||
"nua = NucliaUnderstandingAPI(enable_ml=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders.nuclia import NucliaLoader\n",
|
||||
"\n",
|
||||
"loader = NucliaLoader(\"./interview.mp4\", nua)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You can now call the `load` the document in a loop until you get the document."
|
||||
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
|
||||
"\n",
|
||||
"pending = True\n",
|
||||
"while pending:\n",
|
||||
" time.sleep(15)\n",
|
||||
" docs = loader.load()\n",
|
||||
" if len(docs) > 0:\n",
|
||||
" print(docs[0].page_content)\n",
|
||||
" print(docs[0].metadata)\n",
|
||||
" pending = False\n",
|
||||
" else:\n",
|
||||
" print(\"waiting...\")"
|
||||
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Retrieved information\n",
"\n",
"Nuclia returns the following information:\n",
"\n",
"- file metadata\n",
"- extracted text\n",
"- nested text (like text in an embedded image)\n",
"- paragraphs and sentences splitting (defined by the position of their first and last characters, plus start time and end time for a video or audio file)\n",
"- links\n",
"- a thumbnail\n",
"- embedded files\n",
"\n",
"Note:\n",
"\n",
" Generated files (thumbnail, extracted embedded files, etc.) are provided as a token. You can download them with the [`/processing/download` endpoint](https://docs.nuclia.dev/docs/api#operation/Download_binary_file_processing_download_get).\n",
"\n",
" Also at any level, if an attribute exceeds a certain size, it will be put in a downloadable file and will be replaced in the document by a file pointer. This will consist of `{\"file\": {\"uri\": \"JWT_TOKEN\"}}`. The rule is that if the size of the message is greater than 1000000 characters, the biggest parts will be moved to downloadable files. First, the compression process will target vectors. If that is not enough, it will target large field metadata, and finally it will target extracted text.\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "langchain",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.5"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
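The note above says oversized attributes are replaced in the document by a pointer of the form `{"file": {"uri": "JWT_TOKEN"}}`. A minimal sketch for gathering those tokens from a parsed response follows; the recursive traversal assumes the payload is plain dicts and lists, and only the pointer shape comes from the note:

```python
# Illustrative helper: walk a nested dict/list payload and collect the
# download tokens embedded as {"file": {"uri": "JWT_TOKEN"}} pointers.
def collect_file_pointers(node, found=None):
    if found is None:
        found = []
    if isinstance(node, dict):
        file_ref = node.get("file")
        if isinstance(file_ref, dict) and "uri" in file_ref:
            found.append(file_ref["uri"])
        else:
            for value in node.values():
                collect_file_pointers(value, found)
    elif isinstance(node, list):
        for item in node:
            collect_file_pointers(item, found)
    return found

# Each collected token can then be passed to the /processing/download endpoint.
```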
@@ -1,139 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "3df0dcf8",
"metadata": {},
"source": [
"# PubMed\n",
"\n",
">[PubMed®](https://pubmed.ncbi.nlm.nih.gov/) by `The National Center for Biotechnology Information, National Library of Medicine` comprises more than 35 million citations for biomedical literature from `MEDLINE`, life science journals, and online books. Citations may include links to full text content from `PubMed Central` and publisher web sites."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "aecaff63",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import PubMedLoader"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "f2f7e8d3",
"metadata": {},
"outputs": [],
"source": [
"loader = PubMedLoader(\"chatgpt\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "ed115aa1",
"metadata": {},
"outputs": [],
"source": [
"docs = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "b68d3264-b893-45e4-8ab0-077b25a586dc",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(docs)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "9f4626d2-068d-4aed-9ffe-ad754ad4b4cd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'uid': '37548997',\n",
" 'Title': 'Performance of ChatGPT on the Situational Judgement Test-A Professional Dilemmas-Based Examination for Doctors in the United Kingdom.',\n",
" 'Published': '2023-08-07',\n",
" 'Copyright Information': '©Robin J Borchert, Charlotte R Hickman, Jack Pepys, Timothy J Sadler. Originally published in JMIR Medical Education (https://mededu.jmir.org), 07.08.2023.'}"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs[1].metadata"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "8000f687-b500-4cce-841b-70d6151304da",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\"BACKGROUND: ChatGPT is a large language model that has performed well on professional examinations in the fields of medicine, law, and business. However, it is unclear how ChatGPT would perform on an examination assessing professionalism and situational judgement for doctors.\\nOBJECTIVE: We evaluated the performance of ChatGPT on the Situational Judgement Test (SJT): a national examination taken by all final-year medical students in the United Kingdom. This examination is designed to assess attributes such as communication, teamwork, patient safety, prioritization skills, professionalism, and ethics.\\nMETHODS: All questions from the UK Foundation Programme Office's (UKFPO's) 2023 SJT practice examination were inputted into ChatGPT. For each question, ChatGPT's answers and rationales were recorded and assessed on the basis of the official UK Foundation Programme Office scoring template. Questions were categorized into domains of Good Medical Practice on the basis of the domains referenced in the rationales provided in the scoring sheet. Questions without clear domain links were screened by reviewers and assigned one or multiple domains. ChatGPT's overall performance, as well as its performance across the domains of Good Medical Practice, was evaluated.\\nRESULTS: Overall, ChatGPT performed well, scoring 76% on the SJT but scoring full marks on only a few questions (9%), which may reflect possible flaws in ChatGPT's situational judgement or inconsistencies in the reasoning across questions (or both) in the examination itself. ChatGPT demonstrated consistent performance across the 4 outlined domains in Good Medical Practice for doctors.\\nCONCLUSIONS: Further research is needed to understand the potential applications of large language models, such as ChatGPT, in medical education for standardizing questions and providing consistent rationales for examinations assessing professionalism and ethics.\""
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs[1].page_content"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1070e571-697d-4c33-9a4f-0b2dd6909629",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
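Pulling the PubMed cells above together, a small end-to-end sketch (the printed fields are exactly the metadata keys seen in the output above):

```python
from langchain.document_loaders import PubMedLoader

# Query PubMed and print the metadata fields shown above.
loader = PubMedLoader("chatgpt")
docs = loader.load()
for doc in docs:
    meta = doc.metadata
    print(meta["uid"], meta["Published"], "-", meta["Title"])
```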
@@ -9,7 +9,7 @@
"\n",
"We may want to process load all URLs under a root directory.\n",
|
||||
"\n",
|
||||
"For example, let's look at the [Python 3.9 Document](https://docs.python.org/3.9/).\n",
|
||||
"For example, let's look at the [LangChain JS documentation](https://js.langchain.com/docs/).\n",
|
||||
"\n",
|
||||
"This has many interesting child pages that we may want to read in bulk.\n",
|
||||
"\n",
|
||||
@@ -19,28 +19,13 @@
|
||||
" \n",
|
||||
"We do this using the `RecursiveUrlLoader`.\n",
|
||||
"\n",
|
||||
"This also gives us the flexibility to exclude some children, customize the extractor, and more."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1be8094f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Parameters\n",
|
||||
"- url: str, the target url to crawl.\n",
|
||||
"- exclude_dirs: Optional[str], webpage directories to exclude.\n",
|
||||
"- use_async: Optional[bool], wether to use async requests, using async requests is usually faster in large tasks. However, async will disable the lazy loading feature(the function still works, but it is not lazy). By default, it is set to False.\n",
|
||||
"- extractor: Optional[Callable[[str], str]], a function to extract the text of the document from the webpage, by default it returns the page as it is. It is recommended to use tools like goose3 and beautifulsoup to extract the text. By default, it just returns the page as it is.\n",
|
||||
"- max_depth: Optional[int] = None, the maximum depth to crawl. By default, it is set to 2. If you need to crawl the whole website, set it to a number that is large enough would simply do the job.\n",
|
||||
"- timeout: Optional[int] = None, the timeout for each request, in the unit of seconds. By default, it is set to 10.\n",
|
||||
"- prevent_outside: Optional[bool] = None, whether to prevent crawling outside the root url. By default, it is set to True."
|
||||
"This also gives us the flexibility to exclude some children (e.g., the `api` directory with > 800 child pages)."
|
||||
]
|
||||
},
|
||||
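As referenced in the parameter list above, here is a hedged sketch exercising those parameters together. The URLs and extractor choice mirror the examples in this notebook, and the import path is an assumption based on the era of this PR; treat the configuration as illustrative:

```python
from bs4 import BeautifulSoup as Soup
from langchain.document_loaders import RecursiveUrlLoader

# Illustrative configuration exercising the documented parameters.
loader = RecursiveUrlLoader(
    url="https://js.langchain.com/docs/",                 # root to crawl
    exclude_dirs=["https://js.langchain.com/docs/api/"],  # skip this subtree
    use_async=False,        # False keeps lazy loading available
    extractor=lambda x: Soup(x, "html.parser").text,      # plain-text extraction
    max_depth=2,            # documented default
    timeout=10,             # seconds per request, documented default
    prevent_outside=True,   # stay under the root URL
)
docs = loader.load()
```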
{
"cell_type": "code",
"execution_count": null,
"id": "23c18539",
"execution_count": 1,
"id": "2e3532b2",
"metadata": {},
"outputs": [],
"source": [
@@ -57,15 +42,13 @@
},
{
"cell_type": "code",
"execution_count": null,
"id": "55394afe",
"execution_count": 2,
"id": "d69e5620",
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup as Soup\n",
"\n",
"url = \"https://docs.python.org/3.9/\"\n",
"loader = RecursiveUrlLoader(url=url, max_depth=2, extractor=lambda x: Soup(x, \"html.parser\").text)\n",
"url = \"https://js.langchain.com/docs/modules/memory/examples/\"\n",
"loader = RecursiveUrlLoader(url=url)\n",
"docs = loader.load()"
]
},
@@ -78,7 +61,7 @@
{
"data": {
"text/plain": [
"'\\n\\n\\n\\n\\nPython Frequently Asked Questions — Python 3.'"
"12"
]
},
"execution_count": 3,
@@ -86,71 +69,158 @@
"output_type": "execute_result"
}
],
"source": [
"len(docs)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "89355b7c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'\\n\\n\\n\\n\\nBuffer Window Memory | 🦜️🔗 Langchain\\n\\n\\n\\n\\n\\nSki'"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs[0].page_content[:50]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"id": "13bd7e16",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'source': 'https://docs.python.org/3.9/library/index.html',\n",
" 'title': 'The Python Standard Library — Python 3.9.17 documentation',\n",
" 'language': None}"
"{'source': 'https://js.langchain.com/docs/modules/memory/examples/buffer_window_memory',\n",
" 'title': 'Buffer Window Memory | 🦜️🔗 Langchain',\n",
" 'description': 'BufferWindowMemory keeps track of the back-and-forths in conversation, and then uses a window of size k to surface the last k back-and-forths to use as memory.',\n",
" 'language': 'en'}"
]
},
"execution_count": 4,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs[-1].metadata"
"docs[0].metadata"
]
},
{
"cell_type": "markdown",
"id": "5866e5a6",
"id": "40fc13ef",
"metadata": {},
"source": [
"However, since it's hard to perform a perfect filter, you may still see some irrelevant results in the results. You can perform a filter on the returned documents by yourself, if it's needed. Most of the time, the returned results are good enough."
|
||||
]
},
{
"cell_type": "markdown",
"id": "4ec8ecef",
"metadata": {},
"source": [
"Testing on LangChain docs."
"Now, let's try a more extensive example, the `docs` root dir.\n",
"\n",
"We will skip everything under `api`.\n",
"\n",
"For this, we can `lazy_load` each page as we crawl the tree, using `WebBaseLoader` to load each as we go."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "349b5598",
"execution_count": null,
"id": "5c938b9f",
"metadata": {},
"outputs": [],
"source": [
"url = \"https://js.langchain.com/docs/\"\n",
"exclude_dirs = [\"https://js.langchain.com/docs/api/\"]\n",
"loader = RecursiveUrlLoader(url=url, exclude_dirs=exclude_dirs)\n",
"# Lazy load each\n",
"docs = [print(doc) or doc for doc in loader.lazy_load()]"
]
},
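Because `lazy_load` yields each page as the crawl proceeds, you can also process documents one at a time instead of materializing the whole list as the comprehension above does. A minimal sketch (the per-page handling is illustrative):

```python
# Stream pages as they are crawled; only a running count is kept here.
count = 0
for doc in loader.lazy_load():
    count += 1
    # ... index, print, or otherwise handle each page here ...
print(f"crawled {count} pages")
```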
{
"cell_type": "code",
"execution_count": 7,
"id": "30ff61d3",
"metadata": {},
"outputs": [],
"source": [
"# Load all pages\n",
"docs = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "457e30f3",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"188"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(docs)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "bca80b4a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"8"
"'\\n\\n\\n\\n\\nAgent Simulations | 🦜️🔗 Langchain\\n\\n\\n\\n\\n\\nSkip t'"
]
},
"execution_count": 2,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"url = \"https://js.langchain.com/docs/modules/memory/integrations/\"\n",
"loader = RecursiveUrlLoader(url=url)\n",
"docs = loader.load()\n",
"len(docs)"
"docs[0].page_content[:50]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "df97cf22",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'source': 'https://js.langchain.com/docs/use_cases/agent_simulations/',\n",
" 'title': 'Agent Simulations | 🦜️🔗 Langchain',\n",
" 'description': 'Agent simulations involve taking multiple agents and having them interact with each other.',\n",
" 'language': 'en'}"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs[0].metadata"
]
}
],
@@ -1,311 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "2dfc4698",
"metadata": {},
"source": [
"# RSS Feeds\n",
"\n",
"This covers how to load HTML news articles from a list of RSS feed URLs into a document format that we can use downstream."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e7c2cd52-c1f7-4a06-8539-b0117da91fba",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"!pip install feedparser newspaper3k listparser"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "16c3699e",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import RSSFeedLoader"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "836fbac1",
"metadata": {},
"outputs": [],
"source": [
"urls = [\"https://news.ycombinator.com/rss\"]"
]
},
{
"cell_type": "markdown",
"id": "33089aba-ff74-4d00-8f40-9449c29587cc",
"metadata": {},
"source": [
"Pass in urls to load them into Documents"
|
||||
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "00f46fda",
"metadata": {},
"outputs": [],
"source": [
"loader = RSSFeedLoader(urls=urls)\n",
"data = loader.load()\n",
"print(len(data))"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "b447468cc42266d0",
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(next Rich)\n",
"\n",
"04 August 2023\n",
"\n",
"Rich Hickey\n",
"\n",
"It is with a mixture of heartache and optimism that I announce today my (long planned) retirement from commercial software development, and my employment at Nubank. It’s been thrilling to see Clojure and Datomic successfully applied at scale.\n",
"\n",
"I look forward to continuing to lead ongoing work maintaining and enhancing Clojure with Alex, Stu, Fogus and many others, as an independent developer once again. We have many useful things planned for 1.12 and beyond. The community remains friendly, mature and productive, and is taking Clojure into many interesting new domains.\n",
"\n",
"I want to highlight and thank Nubank for their ongoing sponsorship of Alex, Fogus and the core team, as well as the Clojure community at large.\n",
"\n",
"Stu will continue to lead the development of Datomic at Nubank, where the Datomic team grows and thrives. I’m particularly excited to see where the new free availability of Datomic will lead.\n",
"\n",
"My time with Cognitect remains the highlight of my career. I have learned from absolutely everyone on our team, and am forever grateful to all for our interactions. There are too many people to thank here, but I must extend my sincerest appreciation and love to Stu and Justin for (repeatedly) taking a risk on me and my ideas, and for being the best of partners and friends, at all times fully embodying the notion of integrity. And of course to Alex Miller - who possesses in abundance many skills I lack, and without whose indomitable spirit, positivity and friendship Clojure would not have become what it did.\n",
"\n",
"I have made many friends through Clojure and Cognitect, and I hope to nurture those friendships moving forward.\n",
"\n",
"Retirement returns me to the freedom and independence I had when originally developing Clojure. The journey continues!\n"
]
}
],
"source": [
"print(data[0].page_content)"
]
},
{
"cell_type": "markdown",
"id": "c36d3b0d329faf2a",
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"You can pass arguments to the NewsURLLoader which it uses to load articles."
|
||||
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "5fdada62470d3019",
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Error fetching or processing https://twitter.com/andrewmccalip/status/1687405505604734978, exception: You must `parse()` an article first!\n",
"Error processing entry https://twitter.com/andrewmccalip/status/1687405505604734978, exception: list index out of range\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"13\n"
]
}
],
"source": [
"loader = RSSFeedLoader(urls=urls, nlp=True)\n",
"data = loader.load()\n",
"print(len(data))"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "11d71963f7735c1d",
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"['nubank',\n",
" 'alex',\n",
" 'stu',\n",
" 'taking',\n",
" 'team',\n",
" 'remains',\n",
" 'rich',\n",
" 'clojure',\n",
" 'thank',\n",
" 'planned',\n",
" 'datomic']"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[0].metadata['keywords']"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "9fb64ba0e8780966",
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"'It’s been thrilling to see Clojure and Datomic successfully applied at scale.\\nI look forward to continuing to lead ongoing work maintaining and enhancing Clojure with Alex, Stu, Fogus and many others, as an independent developer once again.\\nThe community remains friendly, mature and productive, and is taking Clojure into many interesting new domains.\\nI want to highlight and thank Nubank for their ongoing sponsorship of Alex, Fogus and the core team, as well as the Clojure community at large.\\nStu will continue to lead the development of Datomic at Nubank, where the Datomic team grows and thrives.'"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[0].metadata['summary']"
]
},
{
"cell_type": "markdown",
"id": "98ac26c488315bff",
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"You can also use an OPML file such as a Feedly export. Pass in either a URL or the OPML contents."
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "8b6f07ae526a897c",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Error fetching http://www.engadget.com/rss-full.xml, exception: Error fetching http://www.engadget.com/rss-full.xml, exception: document declared as us-ascii, but parsed as utf-8\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"20\n"
]
}
],
"source": [
"with open(\"example_data/sample_rss_feeds.opml\", \"r\") as f:\n",
|
||||
" loader = RSSFeedLoader(opml=f.read())\n",
|
||||
"data = loader.load()\n",
|
||||
"print(len(data))"
|
||||
]
|
||||
},
|
||||
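The cell above passes the OPML contents read from a local file; per the note above, a URL pointing at an OPML document should also work. A minimal sketch (the URL is a placeholder, not a real feed list):

```python
# Hypothetical: load feeds listed in a remote OPML document.
loader = RSSFeedLoader(opml="https://example.com/feeds.opml")  # placeholder URL
data = loader.load()
print(len(data))
```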
{
"cell_type": "code",
"execution_count": 40,
"id": "b68a26b3",
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"'The electric vehicle startup Fisker made a splash in Huntington Beach last night, showing off a range of new EVs it plans to build alongside the Fisker Ocean, which is slowly beginning deliveries in Europe and the US. With shades of Lotus circa 2010, it seems there\\'s something for most tastes, with a powerful four-door GT, a versatile pickup truck, and an affordable electric city car.\\n\\n\"We want the world to know that we have big plans and intend to move into several different segments, redefining each with our unique blend of design, innovation, and sustainability,\" said CEO Henrik Fisker.\\n\\nStarting with the cheapest, the Fisker PEAR—a cutesy acronym for \"Personal Electric Automotive Revolution\"—is said to use 35 percent fewer parts than other small EVs. Although it\\'s a smaller car, the PEAR seats six thanks to front and rear bench seats. Oh, and it has a frunk, which the company is calling the \"froot,\" something that will satisfy some British English speakers like Ars\\' friend and motoring journalist Jonny Smith.\\n\\nBut most exciting is the price—starting at $29,900 and scheduled for 2025. Fisker plans to contract with Foxconn to build the PEAR in Lordstown, Ohio, meaning it would be eligible for federal tax incentives.\\n\\nAdvertisement\\n\\nThe Fisker Alaska is the company\\'s pickup truck, built on a modified version of the platform used by the Ocean. It has an extendable cargo bed, which can be as little as 4.5 feet (1,371 mm) or as much as 9.2 feet (2,804 mm) long. Fisker claims it will be both the lightest EV pickup on sale and the most sustainable pickup truck in the world. Range will be an estimated 230–240 miles (370–386 km).\\n\\nThis, too, is slated for 2025, and also at a relatively affordable price, starting at $45,400. Fisker hopes to build this car in North America as well, although it isn\\'t saying where that might take place.\\n\\nFinally, there\\'s the Ronin, a four-door GT that bears more than a passing resemblance to the Fisker Karma, Henrik Fisker\\'s 2012 creation. There\\'s no price for this one, but Fisker says its all-wheel drive powertrain will boast 1,000 hp (745 kW) and will hit 60 mph from a standing start in two seconds—just about as fast as modern tires will allow. Expect a massive battery in this one, as Fisker says it\\'s targeting a 600-mile (956 km) range.\\n\\n\"Innovation and sustainability, along with design, are our three brand values. By 2027, we intend to produce the world’s first climate-neutral vehicle, and as our customers reinvent their relationships with mobility, we want to be a leader in software-defined transportation,\" Fisker said.'"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[0].page_content"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d5a0cbe8-18a6-4af2-b447-7abb8b734451",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "poetry-venv",
"language": "python",
"name": "poetry-venv"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}