def _parse_attributedBody(self, attributedBody: bytes) -> str:
    r"""Extract the message text from a message row's ``attributedBody`` blob.

    The attributedBody field is a binary blob that contains the message
    content after the byte string b"NSString":

                          5 bytes      1-3 bytes    `len` bytes
        ... | b"NSString" | preamble |   `len`   |   contents   | ...

    The 5 preamble bytes are always b"\x01\x94\x84\x01+".

    The size of `len` is either 1 byte or 3 bytes:
    - If the first byte in `len` is b"\x81" then `len` is 3 bytes long,
      and the message length is the 2 bytes after it, in little-endian.
    - Otherwise, the size of `len` is 1 byte, and the message length is
      that byte.

    Args:
        attributedBody: attributedBody field of the message table.

    Returns:
        Text content of the message, decoded as UTF-8 with undecodable
        bytes dropped. An empty string is returned when the blob has no
        b"NSString" marker or nothing follows the preamble.
    """
    # Split on the FIRST marker only: the message text itself may contain
    # the bytes b"NSString", and splitting on every occurrence would
    # truncate the content at that point.
    _, marker, tail = attributedBody.partition(b"NSString")
    if not marker:
        return ""

    content = tail[5:]  # skip the fixed 5-byte preamble
    if not content:
        return ""

    length, start = content[0], 1
    if content[0] == 0x81:
        # 0x81 flags a 2-byte little-endian length in the next two bytes.
        length, start = int.from_bytes(content[1:3], "little"), 3
    return content[start : start + length].decode("utf-8", errors="ignore")
"""Unit tests for the iMessage chat loader."""
import pathlib

from langchain.chat_loaders import imessage, utils


def test_imessage_chat_loader() -> None:
    """Load the fixture chat database and check message content.

    Covers three cases: content stored in the ``text`` column, a short
    message stored only in ``attributedBody``, and a long message stored
    only in ``attributedBody``.
    """
    chat_path = pathlib.Path(__file__).parent / "data" / "imessage_chat.db"
    loader = imessage.IMessageChatLoader(str(chat_path))

    chat_sessions = list(
        utils.map_ai_messages(loader.lazy_load(), sender="testemail@gmail.com")
    )
    assert chat_sessions, "Chat sessions should not be empty"

    assert chat_sessions[0]["messages"], "Chat messages should not be empty"

    # message content in text field
    assert "Yeh" in chat_sessions[0]["messages"][0].content, "Chat content mismatch"

    # short message content in attributedBody field
    assert (
        "John is the almighty" in chat_sessions[0]["messages"][16].content
    ), "Chat content mismatch"

    # long message content in attributedBody field
    # The parentheses are required: without them the second literal is a
    # stand-alone no-op statement and only the first half is ever checked.
    long_msg = (
        "aaaaabbbbbaaaaabbbbbaaaaabbbbbaaaaabbbbbaaaaabbbbbaaaaabbbbbaaaaabbbbba"
        "aaaabbbbbaaaaabbbbbaaaaabbbbbaaaaabbbbbaaaaabbbbbaaaaabbb"
    )
    assert long_msg in chat_sessions[0]["messages"][18].content, "Chat content mismatch"