Compare commits

...

2 Commits

Author SHA1 Message Date
Harrison Chase
92dc7b5f20 structured data extraction 2023-03-11 13:06:41 -08:00
Harrison Chase
c68d3f699b stash 2023-03-11 13:06:01 -08:00
4 changed files with 148 additions and 0 deletions

View File

@@ -0,0 +1,101 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "9bb1293b",
"metadata": {},
"source": [
"# Structured Data Extraction\n",
"\n",
"This notebook goes over how to use a chain to extract structured data from text.\n",
"\n",
"This relies heavily on the fabulous [kor library](https://eyurtsev.github.io/kor/index.html). As a result, it only works with Python 3.10+."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "82ba478c",
"metadata": {},
"outputs": [],
"source": [
"!pip install kor"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "d4b12abd",
"metadata": {},
"outputs": [],
"source": [
"from kor.extraction import Extractor\n",
"from kor.nodes import Object, Text, Number\n",
"from kor.llms import OpenAIChatCompletion, OpenAICompletion"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "15726529",
"metadata": {},
"outputs": [],
"source": [
"llm = OpenAIChatCompletion(model=\"gpt-3.5-turbo\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "8c49e1ba",
"metadata": {},
"outputs": [],
"source": [
"model = Extractor(llm)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "752e408d",
"metadata": {},
"outputs": [],
"source": [
"schema = Text(\n",
" id=\"first_name\",\n",
" description=\"The first name of a person\",\n",
" examples=[(\"I am billy.\", \"billy\"), (\"John Smith is 33 years old\", \"John\")],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "50fe683e",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -25,6 +25,7 @@ from langchain.chains.sql_database.base import (
)
from langchain.chains.transform import TransformChain
from langchain.chains.vector_db_qa.base import VectorDBQA
from langchain.chains.structured_data_extraction.base import StructuredDataExtractionChain
__all__ = [
"ConversationChain",
@@ -52,4 +53,5 @@ __all__ = [
"ChatVectorDBChain",
"GraphQAChain",
"ConstitutionalChain",
"StructuredDataExtractionChain"
]

View File

@@ -0,0 +1,45 @@
from __future__ import annotations
from typing import Dict, List, Any
from langchain.chains.base import Chain
from pydantic import root_validator
class StructuredDataExtractionChain(Chain):
    """Chain that extracts structured data from text via a kor extractor.

    The chain reads a single input (``input_key``), calls ``kor_extractor``
    with that text and ``kor_schema``, and returns the result under
    ``output_key``.
    """

    # Callable invoked as ``kor_extractor(text, kor_schema)``.
    # NOTE(review): presumably a ``kor.extraction.Extractor`` -- confirm.
    kor_extractor: Any
    # Schema object passed through unchanged to ``kor_extractor``.
    # NOTE(review): presumably a kor node such as ``Text``/``Object`` -- confirm.
    kor_schema: Any
    # Key under which the input text is looked up.
    input_key: str = "text"
    # Key under which the extraction result is returned.
    output_key: str = "info"

    @property
    def _chain_type(self) -> str:
        """Chain type identifier; not defined for this chain.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError(
            "StructuredDataExtractionChain does not define a chain type."
        )

    @property
    def input_keys(self) -> List[str]:
        """Return the single expected input key."""
        return [self.input_key]

    @property
    def output_keys(self) -> List[str]:
        """Return the single produced output key."""
        return [self.output_key]

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that the kor python package can be imported.

        Args:
            values: Field values collected by pydantic for this model.

        Returns:
            The same ``values`` mapping, unmodified.

        Raises:
            ValueError: If the ``kor`` package cannot be imported.
        """
        try:
            # Imported only to verify availability; the name is not used here.
            import kor  # noqa: F401
        except ImportError as err:
            raise ValueError(
                "Could not import kor python package. "
                "Please install it with `pip install kor`."
            ) from err
        return values

    def _call(self, inputs: Dict[str, str]) -> Dict[str, str]:
        """Run the extractor on the input text.

        Args:
            inputs: Mapping that must contain ``self.input_key``.

        Returns:
            A one-entry dict mapping ``self.output_key`` to whatever
            ``kor_extractor`` returned.
        """
        result = self.kor_extractor(inputs[self.input_key], self.kor_schema)
        return {self.output_key: result}

    async def _acall(self, inputs: Dict[str, str]) -> Dict[str, str]:
        """Async execution is not implemented for this chain.

        Raises:
            NotImplementedError: Always. Failing loudly avoids silently
                returning ``None`` where a dict of outputs is expected.
        """
        raise NotImplementedError(
            "StructuredDataExtractionChain does not support async execution."
        )