mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-08 14:05:16 +00:00
Update WhatsAppChatLoader regex to handle multiple date-time formats (#4186)
This PR updates the `message_line_regex` used by `WhatsAppChatLoader` to support different date-time formats used in WhatsApp chat exports; resolves #4153. The new regex handles the following input formats:
```terminal
[05.05.23, 15:48:11] James: Hi here
[11/8/21, 9:41:32 AM] User name: Message 123
1/23/23, 3:19 AM - User 2: Bye!
1/23/23, 3:22_AM - User 1: And let me know if anything changes
```
Tests have been added to verify that the loader works correctly with all formats.
This commit is contained in:
parent
a57259ec83
commit
2a3c5f8353
@ -26,16 +26,31 @@ class WhatsAppChatLoader(BaseLoader):
|
|||||||
with open(p, encoding="utf8") as f:
|
with open(p, encoding="utf8") as f:
|
||||||
lines = f.readlines()
|
lines = f.readlines()
|
||||||
|
|
||||||
message_line_regex = (
|
message_line_regex = r"""
|
||||||
r"(\d{1,2}/\d{1,2}/\d{2,4}, "
|
\[?
|
||||||
r"\d{1,2}:\d{1,2}[ _]?(?:AM|PM)?) - "
|
(
|
||||||
r"(.*?): (.*)"
|
\d{1,2}
|
||||||
|
[\/.]
|
||||||
|
\d{1,2}
|
||||||
|
[\/.]
|
||||||
|
\d{2,4}
|
||||||
|
,\s
|
||||||
|
\d{1,2}
|
||||||
|
:\d{2}
|
||||||
|
(?:
|
||||||
|
:\d{2}
|
||||||
|
)?
|
||||||
|
(?:[ _](?:AM|PM))?
|
||||||
)
|
)
|
||||||
|
\]?
|
||||||
|
[\s-]*
|
||||||
|
([\w\s]+)
|
||||||
|
[:]+
|
||||||
|
\s
|
||||||
|
(.+)
|
||||||
|
"""
|
||||||
for line in lines:
|
for line in lines:
|
||||||
result = re.match(
|
result = re.match(message_line_regex, line.strip(), flags=re.VERBOSE)
|
||||||
message_line_regex,
|
|
||||||
line.strip(),
|
|
||||||
)
|
|
||||||
if result:
|
if result:
|
||||||
date, sender, text = result.groups()
|
date, sender, text = result.groups()
|
||||||
text_content += concatenate_rows(date, sender, text)
|
text_content += concatenate_rows(date, sender, text)
|
||||||
|
@ -0,0 +1,19 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from langchain.document_loaders import WhatsAppChatLoader
|
||||||
|
|
||||||
|
|
||||||
|
def test_whatsapp_chat_loader() -> None:
|
||||||
|
"""Test WhatsAppChatLoader."""
|
||||||
|
file_path = Path(__file__).parent.parent / "examples" / "whatsapp_chat.txt"
|
||||||
|
loader = WhatsAppChatLoader(str(file_path))
|
||||||
|
docs = loader.load()
|
||||||
|
|
||||||
|
assert len(docs) == 1
|
||||||
|
assert docs[0].metadata["source"] == str(file_path)
|
||||||
|
assert docs[0].page_content == (
|
||||||
|
"James on 05.05.23, 15:48:11: Hi here\n\n"
|
||||||
|
"User name on 11/8/21, 9:41:32 AM: Message 123\n\n"
|
||||||
|
"User 2 on 1/23/23, 3:19 AM: Bye!\n\n"
|
||||||
|
"User 1 on 1/23/23, 3:22_AM: And let me know if anything changes\n\n"
|
||||||
|
)
|
4
tests/integration_tests/examples/whatsapp_chat.txt
Normal file
4
tests/integration_tests/examples/whatsapp_chat.txt
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
[05.05.23, 15:48:11] James: Hi here
|
||||||
|
[11/8/21, 9:41:32 AM] User name: Message 123
|
||||||
|
1/23/23, 3:19 AM - User 2: Bye!
|
||||||
|
1/23/23, 3:22_AM - User 1: And let me know if anything changes
|
Loading…
Reference in New Issue
Block a user