mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-16 09:48:04 +00:00
Fix WhatsAppChatLoader: enable parsing of additional formats (#6663)
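- Description: Updated the message-line regex to support a new format observed in exported WhatsApp chats: the AM/PM marker is now matched case-insensitively and may be preceded by any whitespace character or an underscore.
- Issue: #6654
- Dependencies: No new dependencies
- Tag maintainer: @rlancemartin, @eyurtsev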
This commit is contained in:
parent
3e30a5d967
commit
afc292e58d
@ -40,7 +40,7 @@ class WhatsAppChatLoader(BaseLoader):
|
||||
(?:
|
||||
:\d{2}
|
||||
)?
|
||||
(?:[ _](?:AM|PM))?
|
||||
(?:[\s_](?:AM|PM))?
|
||||
)
|
||||
\]?
|
||||
[\s-]*
|
||||
@ -50,7 +50,9 @@ class WhatsAppChatLoader(BaseLoader):
|
||||
(.+)
|
||||
"""
|
||||
for line in lines:
|
||||
result = re.match(message_line_regex, line.strip(), flags=re.VERBOSE)
|
||||
result = re.match(
|
||||
message_line_regex, line.strip(), flags=re.VERBOSE | re.IGNORECASE
|
||||
)
|
||||
if result:
|
||||
date, sender, text = result.groups()
|
||||
text_content += concatenate_rows(date, sender, text)
|
||||
|
@ -18,4 +18,6 @@ def test_whatsapp_chat_loader() -> None:
|
||||
"User 1 on 1/23/23, 3:22_AM: And let me know if anything changes\n\n"
|
||||
"~ User name 2 on 1/24/21, 12:41:03 PM: Of course!\n\n"
|
||||
"~ User 2 on 2023/5/4, 16:13:23: See you!\n\n"
|
||||
"User 1 on 7/19/22, 11:32 PM: Hello\n\n"
|
||||
"User 2 on 7/20/22, 11:32 am: Goodbye\n\n"
|
||||
)
|
||||
|
@ -3,4 +3,6 @@
|
||||
1/23/23, 3:19 AM - User 2: Bye!
|
||||
1/23/23, 3:22_AM - User 1: And let me know if anything changes
|
||||
[1/24/21, 12:41:03 PM] ~ User name 2: Of course!
|
||||
[2023/5/4, 16:13:23] ~ User 2: See you!
|
||||
[2023/5/4, 16:13:23] ~ User 2: See you!
|
||||
7/19/22, 11:32 PM - User 1: Hello
|
||||
7/20/22, 11:32 am - User 2: Goodbye
|
||||
|
Loading…
Reference in New Issue
Block a user