diff --git a/cookbook/together_ai.ipynb b/cookbook/together_ai.ipynb
new file mode 100644
index 00000000000..1266073e83c
--- /dev/null
+++ b/cookbook/together_ai.ipynb
@@ -0,0 +1,179 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "0fc0309d-4d49-4bb5-bec0-bd92c6fddb28",
+   "metadata": {},
+   "source": [
+    "## Together AI + RAG\n",
+    "\n",
+    "[Together AI](https://python.langchain.com/docs/integrations/llms/together) provides an inference API for a broad set of open-source LLMs (see the [playground](https://api.together.xyz/playground)).\n",
+    "\n",
+    "We use `mistralai/Mixtral-8x7B-Instruct-v0.1` for RAG over the Mixtral paper.\n",
+    "\n",
+    "Download the paper:\n",
+    "https://arxiv.org/pdf/2401.04088.pdf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d12fb75a-f707-48d5-82a5-efe2d041813c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! pip install --quiet pypdf chromadb tiktoken openai langchain-together"
+   ]
+  },
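+  {
+   "cell_type": "markdown",
+   "id": "3f9a2b1c-0d4e-4f5a-8b6c-7d8e9f0a1b2c",
+   "metadata": {},
+   "source": [
+    "The cells below call both the Together inference API (the `Together` LLM) and OpenAI (`OpenAIEmbeddings`). A minimal, optional sketch, assuming the standard `TOGETHER_API_KEY` and `OPENAI_API_KEY` environment variable names, to prompt for any key that is not already set:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5c6d7e8f-1a2b-4c3d-9e0f-a1b2c3d4e5f6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import getpass\n",
+    "import os\n",
+    "\n",
+    "# Prompt for any API key missing from the environment, rather than\n",
+    "# hard-coding secrets in the notebook\n",
+    "for key in [\"TOGETHER_API_KEY\", \"OPENAI_API_KEY\"]:\n",
+    "    if key not in os.environ:\n",
+    "        os.environ[key] = getpass.getpass(f\"{key}: \")"
+   ]
+  },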
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9ab49327-0532-4480-804c-d066c302a322",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the paper\n",
+    "from langchain_community.document_loaders import PyPDFLoader\n",
+    "\n",
+    "loader = PyPDFLoader(\"~/Desktop/mixtral.pdf\")\n",
+    "data = loader.load()\n",
+    "\n",
+    "# Split into chunks\n",
+    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
+    "\n",
+    "text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0)\n",
+    "all_splits = text_splitter.split_documents(data)\n",
+    "\n",
+    "# Index the chunks in a vectorstore\n",
+    "from langchain_community.embeddings import OpenAIEmbeddings\n",
+    "from langchain_community.vectorstores import Chroma\n",
+    "\n",
+    "# Optionally, swap in Together's hosted embeddings for OpenAI's:\n",
+    "# from langchain_together.embeddings import TogetherEmbeddings\n",
+    "# embeddings = TogetherEmbeddings(model=\"togethercomputer/m2-bert-80M-8k-retrieval\")\n",
+    "vectorstore = Chroma.from_documents(\n",
+    "    documents=all_splits,\n",
+    "    collection_name=\"rag-chroma\",\n",
+    "    embedding=OpenAIEmbeddings(),\n",
+    ")\n",
+    "\n",
+    "retriever = vectorstore.as_retriever()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "4efaddd9-3dbb-455c-ba54-0ad7f2d2ce0f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_core.output_parsers import StrOutputParser\n",
+    "from langchain_core.prompts import ChatPromptTemplate\n",
+    "from langchain_core.runnables import RunnableParallel, RunnablePassthrough\n",
+    "\n",
+    "# RAG prompt\n",
+    "template = \"\"\"Answer the question based only on the following context:\n",
+    "{context}\n",
+    "\n",
+    "Question: {question}\n",
+    "\"\"\"\n",
+    "prompt = ChatPromptTemplate.from_template(template)\n",
+    "\n",
+    "# LLM\n",
+    "from langchain_community.llms import Together\n",
+    "\n",
+    "llm = Together(\n",
+    "    model=\"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n",
+    "    temperature=0.0,\n",
+    "    max_tokens=2000,\n",
+    "    top_k=1,\n",
+    ")\n",
+    "\n",
+    "# RAG chain: retrieve context, fill the prompt, call the LLM, parse to string\n",
+    "chain = (\n",
+    "    RunnableParallel({\"context\": retriever, \"question\": RunnablePassthrough()})\n",
+    "    | prompt\n",
+    "    | llm\n",
+    "    | StrOutputParser()\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "88b1ee51-1b0f-4ebf-bb32-e50e843f0eeb",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'\\nAnswer: The architectural details of Mixtral are as follows:\\n- Dimension (dim): 4096\\n- Number of layers (n\\\\_layers): 32\\n- Dimension of each head (head\\\\_dim): 128\\n- Hidden dimension (hidden\\\\_dim): 14336\\n- Number of heads (n\\\\_heads): 32\\n- Number of kv heads (n\\\\_kv\\\\_heads): 8\\n- Context length (context\\\\_len): 32768\\n- Vocabulary size (vocab\\\\_size): 32000\\n- Number of experts (num\\\\_experts): 8\\n- Number of top k experts (top\\\\_k\\\\_experts): 2\\n\\nMixtral is based on a transformer architecture and uses the same modifications as described in [18], with the notable exceptions that Mixtral supports a fully dense context length of 32k tokens, and the feedforward block picks from a set of 8 distinct groups of parameters. At every layer, for every token, a router network chooses two of these groups (the “experts”) to process the token and combine their output additively. This technique increases the number of parameters of a model while controlling cost and latency, as the model only uses a fraction of the total set of parameters per token. Mixtral is pretrained with multilingual data using a context size of 32k tokens. It either matches or exceeds the performance of Llama 2 70B and GPT-3.5, over several benchmarks. In particular, Mixtral vastly outperforms Llama 2 70B on mathematics, code generation, and multilingual benchmarks.'"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "chain.invoke(\"What are the Architectural details of Mixtral?\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "755cf871-26b7-4e30-8b91-9ffd698470f4",
+   "metadata": {},
+   "source": [
+    "Trace:\n",
+    "\n",
+    "https://smith.langchain.com/public/935fd642-06a6-4b42-98e3-6074f93115cd/r"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}