mirror of
https://github.com/hwchase17/langchain.git
synced 2025-05-28 02:29:17 +00:00
Newlines should be removed only for the 1st-gen embedding models (#3853)
Only 1st-generation OpenAI embedding models are negatively impacted by newlines. Context: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
This commit is contained in:
parent
6bd367916c
commit
a5a4999fb7
@ -158,8 +158,10 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
|
||||
indices = []
|
||||
encoding = tiktoken.model.encoding_for_model(self.model)
|
||||
for i, text in enumerate(texts):
|
||||
# replace newlines, which can negatively affect performance.
|
||||
text = text.replace("\n", " ")
|
||||
if self.model.endswith("001"):
|
||||
# See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
|
||||
# replace newlines, which can negatively affect performance.
|
||||
text = text.replace("\n", " ")
|
||||
token = encoding.encode(
|
||||
text,
|
||||
allowed_special=self.allowed_special,
|
||||
@ -212,8 +214,10 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
|
||||
if len(text) > self.embedding_ctx_length:
|
||||
return self._get_len_safe_embeddings([text], engine=engine)[0]
|
||||
else:
|
||||
# replace newlines, which can negatively affect performance.
|
||||
text = text.replace("\n", " ")
|
||||
if self.model.endswith("001"):
|
||||
# See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
|
||||
# replace newlines, which can negatively affect performance.
|
||||
text = text.replace("\n", " ")
|
||||
return embed_with_retry(self, input=[text], engine=engine)["data"][0][
|
||||
"embedding"
|
||||
]
|
||||
|
Loading…
Reference in New Issue
Block a user