Synthetic data generation (#9759)

### Description
Implements synthetic data generation with the fields and preferences given by the user. Adds a showcase notebook. The corresponding prompt was proposed for langchain-hub.

### Example
```
output = chain({"fields": {"colors": ["blue", "yellow"]}, "preferences": {"style": "Make it in a style of a weather forecast."}})
print(output)
# {'fields': {'colors': ['blue', 'yellow']}, 'preferences': {'style': 'Make it in a style of a weather forecast.'}, 'text': "Good morning! Today's weather forecast brings a beautiful combination of colors to the sky, with hues of blue and yellow gently blending together like a mesmerizing painting."}
```

### Twitter handle
@deepsense_ai @matt_wosinski

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
2025-09-23 19:39:58 +00:00 · 2023-09-20 01:29:50 +02:00
parent c4a6de3fc9
commit a29cd89923
3 changed files with 503 additions and 0 deletions
--- a/libs/experimental/langchain_experimental/synthetic_data/__init__.py
+++ b/libs/experimental/langchain_experimental/synthetic_data/__init__.py
@@ -0,0 +1,51 @@
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
+
+from langchain.chains.llm import LLMChain
+
+from langchain_experimental.synthetic_data.prompts import SENTENCE_PROMPT
+
+if TYPE_CHECKING:
+    from langchain.chains.base import Chain
+    from langchain.prompts import PromptTemplate
+    from langchain.schema.language_model import BaseLanguageModel
+
+
+def create_data_generation_chain(
+    llm: "BaseLanguageModel",
+    prompt: Optional["PromptTemplate"] = None,
+) -> "Chain":
+    """Create a chain that generates synthetic sentences from provided fields.
+
+    Args:
+        llm: The language model to use.
+        prompt: Prompt to feed the language model with. If not provided,
+            ``SENTENCE_PROMPT`` is used.
+
+    Returns:
+        An ``LLMChain`` that runs ``llm`` on the selected prompt.
+    """
+    # The annotations above are string literals on purpose: BaseLanguageModel,
+    # PromptTemplate and Chain are imported only under ``TYPE_CHECKING``, so
+    # unquoted annotations would raise NameError when this module is imported.
+    return LLMChain(
+        llm=llm,
+        prompt=prompt or SENTENCE_PROMPT,
+    )
+
+
+class DatasetGenerator:
+    """Generates synthetic dataset with a given language model.
+
+    Calling an instance runs the data generation chain once per entry of the
+    supplied collection, passing the same sentence preferences every time.
+    """
+
+    def __init__(
+        self,
+        llm: "BaseLanguageModel",
+        sentence_preferences: Optional[Dict[str, Any]] = None,
+    ):
+        """Build the generation chain and remember the preferences.
+
+        Args:
+            llm: The language model to use.
+            sentence_preferences: Value sent to the chain under the
+                ``"preferences"`` key on every call. Defaults to an empty
+                dict when omitted.
+        """
+        # ``llm`` is annotated with a string because BaseLanguageModel is only
+        # imported under ``TYPE_CHECKING``; an unquoted annotation would raise
+        # NameError as soon as this class body is executed.
+        self.generator = create_data_generation_chain(llm)
+        self.sentence_preferences = sentence_preferences or {}
+
+    def __call__(self, fields_collection: List[List[Any]]) -> List[Dict[str, Any]]:
+        """Generate one chain output per entry of ``fields_collection``.
+
+        Args:
+            fields_collection: Entries to generate for; each entry is sent to
+                the chain under the ``"fields"`` key.
+
+        Returns:
+            The chain outputs, in the same order as ``fields_collection``.
+            An empty collection yields an empty list.
+        """
+        preferences = self.sentence_preferences
+        return [
+            self.generator({"fields": fields, "preferences": preferences})
+            for fields in fields_collection
+        ]
--- a/libs/experimental/langchain_experimental/synthetic_data/prompts.py
+++ b/libs/experimental/langchain_experimental/synthetic_data/prompts.py
@@ -0,0 +1,15 @@
+from langchain.prompts.prompt import PromptTemplate
+
+# Instruction text for the sentence generator. ``{fields}`` and
+# ``{preferences}`` are the two placeholders declared as input variables on
+# the prompt template below.
+sentence_template = """Given the following fields, create a sentence about them. 
+Make the sentence detailed and interesting. Use every given field.
+If any additional preferences are given, use them during sentence construction as well.
+Fields:
+{fields}
+Preferences:
+{preferences}
+Sentence:
+"""
+
+# Prompt template built from ``sentence_template``.
+SENTENCE_PROMPT = PromptTemplate(
+    input_variables=["fields", "preferences"],
+    template=sentence_template,
+)