diff --git a/langchain/vectorstores/faiss.py b/langchain/vectorstores/faiss.py index d71280a73f4..9b139807a51 100644 --- a/langchain/vectorstores/faiss.py +++ b/langchain/vectorstores/faiss.py @@ -55,6 +55,36 @@ class FAISS(VectorStore): self.docstore = docstore self.index_to_docstore_id = index_to_docstore_id + def __add( + self, + texts: Iterable[str], + embeddings: Iterable[List[float]], + metadatas: Optional[List[dict]] = None, + **kwargs: Any, + ) -> List[str]: + if not isinstance(self.docstore, AddableMixin): + raise ValueError( + "If trying to add texts, the underlying docstore should support " + f"adding items, which {self.docstore} does not" + ) + documents = [] + for i, text in enumerate(texts): + metadata = metadatas[i] if metadatas else {} + documents.append(Document(page_content=text, metadata=metadata)) + # Add to the index, the index_to_id mapping, and the docstore. + starting_len = len(self.index_to_docstore_id) + self.index.add(np.array(embeddings, dtype=np.float32)) + # Get list of index, id, and docs. + full_info = [ + (starting_len + i, str(uuid.uuid4()), doc) + for i, doc in enumerate(documents) + ] + # Add information to docstore and index. + self.docstore.add({_id: doc for _, _id, doc in full_info}) + index_to_id = {index: _id for index, _id, _ in full_info} + self.index_to_docstore_id.update(index_to_id) + return [_id for _, _id, _ in full_info] + def add_texts( self, texts: Iterable[str], @@ -77,23 +107,34 @@ class FAISS(VectorStore): ) # Embed and create the documents. embeddings = [self.embedding_function(text) for text in texts] - documents = [] - for i, text in enumerate(texts): - metadata = metadatas[i] if metadatas else {} - documents.append(Document(page_content=text, metadata=metadata)) - # Add to the index, the index_to_id mapping, and the docstore. - starting_len = len(self.index_to_docstore_id) - self.index.add(np.array(embeddings, dtype=np.float32)) - # Get list of index, id, and docs. - full_info = [ - (starting_len + i, str(uuid.uuid4()), doc) - for i, doc in enumerate(documents) - ] - # Add information to docstore and index. - self.docstore.add({_id: doc for _, _id, doc in full_info}) - index_to_id = {index: _id for index, _id, _ in full_info} - self.index_to_docstore_id.update(index_to_id) - return [_id for _, _id, _ in full_info] + return self.__add(texts, embeddings, metadatas, **kwargs) + + def add_embeddings( + self, + text_embeddings: Iterable[Tuple[str, List[float]]], + metadatas: Optional[List[dict]] = None, + **kwargs: Any, + ) -> List[str]: + """Run more texts through the embeddings and add to the vectorstore. + + Args: + text_embeddings: Iterable pairs of string and embedding to + add to the vectorstore. + metadatas: Optional list of metadatas associated with the texts. + + Returns: + List of ids from adding the texts into the vectorstore. + """ + if not isinstance(self.docstore, AddableMixin): + raise ValueError( + "If trying to add texts, the underlying docstore should support " + f"adding items, which {self.docstore} does not" + ) + # Embed and create the documents. + + texts = [te[0] for te in text_embeddings] + embeddings = [te[1] for te in text_embeddings] + return self.__add(texts, embeddings, metadatas, **kwargs) def similarity_search_with_score_by_vector( self, embedding: List[float], k: int = 4 @@ -253,6 +294,28 @@ class FAISS(VectorStore): index_to_id = {index: _id for index, _id, _ in full_info} self.index_to_docstore_id.update(index_to_id) + @classmethod + def __from( + cls, + texts: List[str], + embeddings: List[List[float]], + embedding: Embeddings, + metadatas: Optional[List[dict]] = None, + **kwargs: Any, + ) -> FAISS: + faiss = dependable_faiss_import() + index = faiss.IndexFlatL2(len(embeddings[0])) + index.add(np.array(embeddings, dtype=np.float32)) + documents = [] + for i, text in enumerate(texts): + metadata = metadatas[i] if metadatas else {} + documents.append(Document(page_content=text, metadata=metadata)) + index_to_id = {i: str(uuid.uuid4()) for i in range(len(documents))} + docstore = InMemoryDocstore( + {index_to_id[i]: doc for i, doc in enumerate(documents)} + ) + return cls(embedding.embed_query, index, docstore, index_to_id) + @classmethod def from_texts( cls, @@ -278,19 +341,37 @@ class FAISS(VectorStore): embeddings = OpenAIEmbeddings() faiss = FAISS.from_texts(texts, embeddings) """ - faiss = dependable_faiss_import() embeddings = embedding.embed_documents(texts) - index = faiss.IndexFlatL2(len(embeddings[0])) - index.add(np.array(embeddings, dtype=np.float32)) - documents = [] - for i, text in enumerate(texts): - metadata = metadatas[i] if metadatas else {} - documents.append(Document(page_content=text, metadata=metadata)) - index_to_id = {i: str(uuid.uuid4()) for i in range(len(documents))} - docstore = InMemoryDocstore( - {index_to_id[i]: doc for i, doc in enumerate(documents)} - ) - return cls(embedding.embed_query, index, docstore, index_to_id) + return cls.__from(texts, embeddings, embedding, metadatas, **kwargs) + + @classmethod + def from_embeddings( + cls, + text_embeddings: List[Tuple[str, List[float]]], + embedding: Embeddings, + metadatas: Optional[List[dict]] = None, + **kwargs: Any, + ) -> FAISS: + """Construct FAISS wrapper from raw documents. + + This is a user friendly interface that: + 1. Embeds documents. + 2. Creates an in memory docstore + 3. Initializes the FAISS database + + This is intended to be a quick way to get started. + + Example: + .. code-block:: python + + from langchain import FAISS + from langchain.embeddings import OpenAIEmbeddings + embeddings = OpenAIEmbeddings() + faiss = FAISS.from_texts(texts, embeddings) + """ + texts = [t[0] for t in text_embeddings] + embeddings = [t[1] for t in text_embeddings] + return cls.__from(texts, embeddings, embedding, metadatas, **kwargs) def save_local(self, folder_path: str) -> None: """Save FAISS index, docstore, and index_to_docstore_id to disk.