notebook and other cleanup pre-publish

This commit is contained in:
Taqi Jaffri
2023-05-11 01:26:51 -07:00
parent 01de9f74bf
commit ef7cb286e9
3 changed files with 287 additions and 182 deletions

View File

@@ -56,46 +56,34 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='MUTUAL NON-DISCLOSURE AGREEMENT', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MutualNon-disclosure', 'documentId': '43rj0ds7s0ur', 'structure': 'h1', 'tag': 'MutualNon-disclosure', 'projects': []}),\n",
" Document(page_content='This Mutual Non-Disclosure Agreement (this “ Agreement ”) is entered into and made effective as of April 4 , 2018 between Docugami Inc. , a Delaware corporation , whose address is 150 Lake Street South , Suite 221 , Kirkland , Washington 98033 , and Caleb Divine , an individual, whose address is 1201 Rt 300 , Newburgh NY 12550 .', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:ThisMutualNon-disclosureAgreement', 'documentId': '43rj0ds7s0ur', 'structure': 'p', 'tag': 'ThisMutualNon-disclosureAgreement', 'projects': []}),\n",
" Document(page_content='The above named parties desire to engage in discussions regarding a potential agreement or other transaction between the parties (the “Purpose”). In connection with such discussions, it may be necessary for the parties to disclose to each other certain confidential information or materials to enable them to evaluate whether to enter into such agreement or transaction.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Discussions', 'documentId': '43rj0ds7s0ur', 'structure': 'p', 'tag': 'Discussions', 'projects': []}),\n",
" Document(page_content='In consideration of the foregoing, the parties agree as follows:1.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Consideration', 'documentId': '43rj0ds7s0ur', 'structure': 'p', 'tag': 'Consideration', 'projects': []}),\n",
" Document(page_content='Confidential Information .', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:Purposes/docset:ConfidentialInformation-section/docset:ConfidentialInformation[1]', 'documentId': '43rj0ds7s0ur', 'structure': 'h1', 'tag': 'ConfidentialInformation', 'projects': []}),\n",
" Document(page_content='For purposes of this Agreement , “ Confidential Information ” means any information or materials disclosed by one party to the other party that: (i) if disclosed in writing or in the form of tangible materials, is marked “confidential” or “proprietary” at the time of such disclosure; (ii) if disclosed orally or by visual presentation, is identified as “confidential” or “proprietary” at the time of such disclosure, and is summarized in a writing sent by the disclosing party to the receiving party within thirty ( 30 ) days after any such disclosure; or (iii) due to its nature or the circumstances of its disclosure, a person exercising reasonable business judgment would understand to be confidential or proprietary.2.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:Purposes/docset:ConfidentialInformation-section/docset:ConfidentialInformation[2]', 'documentId': '43rj0ds7s0ur', 'structure': 'div', 'tag': 'ConfidentialInformation', 'projects': []}),\n",
" Document(page_content='Obligations and Restrictions .', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:Obligations/docset:ObligationsAndRestrictions-section/docset:Obligations', 'documentId': '43rj0ds7s0ur', 'structure': 'h1', 'tag': 'Obligations', 'projects': []}),\n",
" Document(page_content=\"Each party agrees: (i) to maintain the other party's Confidential Information in strict confidence; (ii) not to disclose such Confidential Information to any third party; and (iii) not to use such Confidential Information for any purpose except for the Purpose. Each party may disclose the other partys Confidential Information to its employees and consultants who have a bona fide need to know such Confidential Information for the Purpose, but solely to the extent necessary to pursue the Purpose and for no other purpose; provided, that each such employee and consultant first executes a written agreement (or is otherwise already bound by a written agreement) that contains use and nondisclosure restrictions at least as protective of the other partys Confidential Information as those set forth in this Agreement .3.\", metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:Obligations/docset:ObligationsAndRestrictions-section/docset:ObligationsAndRestrictions', 'documentId': '43rj0ds7s0ur', 'structure': 'div', 'tag': 'ObligationsAndRestrictions', 'projects': []}),\n",
" Document(page_content='Exceptions.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:Exceptions/docset:Exceptions-section/docset:Exceptions[1]', 'documentId': '43rj0ds7s0ur', 'structure': 'h1', 'tag': 'Exceptions', 'projects': []}),\n",
" Document(page_content='The obligations and restrictions in Section 2 will not apply to any information or materials that:(i)', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:Exceptions/docset:Exceptions-section/docset:Exceptions[2]', 'documentId': '43rj0ds7s0ur', 'structure': 'div', 'tag': 'Exceptions', 'projects': []}),\n",
" Document(page_content='were, at the date of disclosure, or have subsequently become, generally known or available to the public through no act or failure to act by the receiving party;(ii)', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:TheDate/docset:TheDate/docset:TheDate', 'documentId': '43rj0ds7s0ur', 'structure': 'p', 'tag': 'TheDate', 'projects': []}),\n",
" Document(page_content='were rightfully known by the receiving party prior to receiving such information or materials from the disclosing party;(iii)', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:TheDate/docset:SuchInformation/docset:TheReceivingParty', 'documentId': '43rj0ds7s0ur', 'structure': 'p', 'tag': 'TheReceivingParty', 'projects': []}),\n",
" Document(page_content='are rightfully acquired by the receiving party from a third party who has the right to disclose such information or materials without breach of any confidentiality obligation to the disclosing party;4.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:TheDate/docset:TheReceivingParty/docset:TheReceivingParty', 'documentId': '43rj0ds7s0ur', 'structure': 'p', 'tag': 'TheReceivingParty', 'projects': []}),\n",
" Document(page_content='Compelled Disclosure .', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:Disclosure/docset:CompelledDisclosure-section/dg:chunk', 'documentId': '43rj0ds7s0ur', 'structure': 'h1', 'tag': 'chunk', 'projects': []}),\n",
" Document(page_content='Nothing in this Agreement will be deemed to restrict a party from disclosing the other partys Confidential Information to the extent required by any order, subpoena, law, statute or regulation; provided, that the party required to make such a disclosure uses reasonable efforts to give the other party reasonable advance notice of such required disclosure in order to enable the other party to prevent or limit such disclosure.5.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:Disclosure/docset:CompelledDisclosure-section/docset:CompelledDisclosure', 'documentId': '43rj0ds7s0ur', 'structure': 'div', 'tag': 'CompelledDisclosure', 'projects': []}),\n",
" Document(page_content='Return of Confidential Information .', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:TheCompletion/docset:ReturnofConfidentialInformation-section/dg:chunk', 'documentId': '43rj0ds7s0ur', 'structure': 'h1', 'tag': 'chunk', 'projects': []}),\n",
" Document(page_content='Upon the completion or abandonment of the Purpose, and in any event upon the disclosing partys request, the receiving party will promptly return to the disclosing party all tangible items and embodiments containing or consisting of the disclosing partys Confidential Information and all copies thereof (including electronic copies), and any notes, analyses, compilations, studies, interpretations, memoranda or other documents (regardless of the form thereof) prepared by or on behalf of the receiving party that contain or are based upon the disclosing partys Confidential Information .6.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:TheCompletion/docset:ReturnofConfidentialInformation-section/docset:ReturnofConfidentialInformation', 'documentId': '43rj0ds7s0ur', 'structure': 'div', 'tag': 'ReturnofConfidentialInformation', 'projects': []}),\n",
" Document(page_content='No Obligations .', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:NoObligations/docset:NoObligations-section/docset:NoObligations[1]', 'documentId': '43rj0ds7s0ur', 'structure': 'h1', 'tag': 'NoObligations', 'projects': []}),\n",
" Document(page_content='Each party retains the right to determine whether to disclose any Confidential Information to the other party.7.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:NoObligations/docset:NoObligations-section/docset:NoObligations[2]', 'documentId': '43rj0ds7s0ur', 'structure': 'div', 'tag': 'NoObligations', 'projects': []}),\n",
" Document(page_content='No Warranty.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:NoWarranty/docset:NoWarranty-section/docset:NoWarranty[1]', 'documentId': '43rj0ds7s0ur', 'structure': 'h1', 'tag': 'NoWarranty', 'projects': []}),\n",
" Document(page_content='ALL CONFIDENTIAL INFORMATION IS PROVIDED BY THE DISCLOSING PARTY “AS IS ”.8.Term.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:NoWarranty/docset:NoWarranty-section/docset:NoWarranty[2]', 'documentId': '43rj0ds7s0ur', 'structure': 'div', 'tag': 'NoWarranty', 'projects': []}),\n",
" Document(page_content='This Agreement will remain in effect for a period of seven ( 7 ) years from the date of last disclosure of Confidential Information by either party, at which time it will terminate.9.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:ThisAgreement/docset:Term-section/docset:Term', 'documentId': '43rj0ds7s0ur', 'structure': 'div', 'tag': 'Term', 'projects': []}),\n",
" Document(page_content='Equitable Relief .', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:EquitableRelief/docset:EquitableRelief-section/docset:EquitableRelief[1]', 'documentId': '43rj0ds7s0ur', 'structure': 'h1', 'tag': 'EquitableRelief', 'projects': []}),\n",
" Document(page_content='Each party acknowledges that the unauthorized use or disclosure of the disclosing partys Confidential Information may cause the disclosing party to incur irreparable harm and significant damages, the degree of which may be difficult to ascertain. Accordingly, each party agrees that the disclosing party will have the right to seek immediate equitable relief to enjoin any unauthorized use or disclosure of its Confidential Information , in addition to any other rights and remedies that it may have at law or otherwise.10.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:EquitableRelief/docset:EquitableRelief-section/docset:EquitableRelief[2]', 'documentId': '43rj0ds7s0ur', 'structure': 'div', 'tag': 'EquitableRelief', 'projects': []}),\n",
" Document(page_content='Non-compete.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:TheMaximumExtent/docset:Non-compete-section/dg:chunk', 'documentId': '43rj0ds7s0ur', 'structure': 'h1', 'tag': 'chunk', 'projects': []}),\n",
" Document(page_content='To the maximum extent permitted by applicable law, during the Term of this Agreement and for a period of one ( 1 ) year thereafter, Caleb Divine may not market software products or do business that directly or indirectly competes with Docugami software products .11.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:TheMaximumExtent/docset:Non-compete-section/docset:Non-compete', 'documentId': '43rj0ds7s0ur', 'structure': 'div', 'tag': 'Non-compete', 'projects': []}),\n",
" Document(page_content='Miscellaneous.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:Accordance/docset:Miscellaneous-section/dg:chunk', 'documentId': '43rj0ds7s0ur', 'structure': 'h1', 'tag': 'chunk', 'projects': []}),\n",
" Document(page_content='This Agreement will be governed and construed in accordance with the laws of the State of Washington , excluding its body of law controlling conflict of laws. This Agreement is the complete and exclusive understanding and agreement between the parties regarding the subject matter of this Agreement and supersedes all prior agreements, understandings and communications, oral or written, between the parties regarding the subject matter of this Agreement . If any provision of this Agreement is held invalid or unenforceable by a court of competent jurisdiction, that provision of this Agreement will be enforced to the maximum extent permissible and the other provisions of this Agreement will remain in full force and effect. Neither party may assign this Agreement , in whole or in part, by operation of law or otherwise, without the other partys prior written consent, and any attempted assignment without such consent will be void. This Agreement may be executed in counterparts, each of which will be deemed an original, but all of which together will constitute one and the same instrument.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:Accordance/docset:Miscellaneous-section/docset:Miscellaneous', 'documentId': '43rj0ds7s0ur', 'structure': 'div', 'tag': 'Miscellaneous', 'projects': []}),\n",
" Document(page_content='[SIGNATURE PAGE FOLLOWS]', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/dg:chunk', 'documentId': '43rj0ds7s0ur', 'structure': 'h1', 'tag': 'chunk', 'projects': []}),\n",
" Document(page_content='IN WITNESS WHEREOF,', metadata={'xpath': '/docset:MutualNon-disclosure/docset:Witness/dg:chunk', 'documentId': '43rj0ds7s0ur', 'structure': 'h1', 'tag': 'chunk', 'projects': []}),\n",
" Document(page_content='the parties hereto have executed this Mutual Non-Disclosure Agreement by their duly authorized officers or representatives as of the date first set forth above.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:Witness/docset:TheParties/docset:TheParties', 'documentId': '43rj0ds7s0ur', 'structure': 'p', 'tag': 'TheParties', 'projects': []})]"
"[Document(page_content='MUTUAL NON-DISCLOSURE AGREEMENT This Mutual Non-Disclosure Agreement (this “ Agreement ”) is entered into and made effective as of April 4 , 2018 between Docugami Inc. , a Delaware corporation , whose address is 150 Lake Street South , Suite 221 , Kirkland , Washington 98033 , and Caleb Divine , an individual, whose address is 1201 Rt 300 , Newburgh NY 12550 .', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:ThisMutualNon-disclosureAgreement', 'documentId': '43rj0ds7s0ur', 'documentName': 'NDA simple layout.docx', 'structure': 'p', 'tag': 'ThisMutualNon-disclosureAgreement'}),\n",
" Document(page_content='The above named parties desire to engage in discussions regarding a potential agreement or other transaction between the parties (the “Purpose”). In connection with such discussions, it may be necessary for the parties to disclose to each other certain confidential information or materials to enable them to evaluate whether to enter into such agreement or transaction.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Discussions', 'documentId': '43rj0ds7s0ur', 'documentName': 'NDA simple layout.docx', 'structure': 'p', 'tag': 'Discussions'}),\n",
" Document(page_content='In consideration of the foregoing, the parties agree as follows:', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Consideration', 'documentId': '43rj0ds7s0ur', 'documentName': 'NDA simple layout.docx', 'structure': 'p', 'tag': 'Consideration'}),\n",
" Document(page_content='1. Confidential Information . For purposes of this Agreement , “ Confidential Information ” means any information or materials disclosed by one party to the other party that: (i) if disclosed in writing or in the form of tangible materials, is marked “confidential” or “proprietary” at the time of such disclosure; (ii) if disclosed orally or by visual presentation, is identified as “confidential” or “proprietary” at the time of such disclosure, and is summarized in a writing sent by the disclosing party to the receiving party within thirty ( 30 ) days after any such disclosure; or (iii) due to its nature or the circumstances of its disclosure, a person exercising reasonable business judgment would understand to be confidential or proprietary.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:Purposes/docset:ConfidentialInformation-section/docset:ConfidentialInformation[2]', 'documentId': '43rj0ds7s0ur', 'documentName': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'ConfidentialInformation'}),\n",
" Document(page_content=\"2. Obligations and Restrictions . Each party agrees: (i) to maintain the other party's Confidential Information in strict confidence; (ii) not to disclose such Confidential Information to any third party; and (iii) not to use such Confidential Information for any purpose except for the Purpose. Each party may disclose the other partys Confidential Information to its employees and consultants who have a bona fide need to know such Confidential Information for the Purpose, but solely to the extent necessary to pursue the Purpose and for no other purpose; provided, that each such employee and consultant first executes a written agreement (or is otherwise already bound by a written agreement) that contains use and nondisclosure restrictions at least as protective of the other partys Confidential Information as those set forth in this Agreement .\", metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:Obligations/docset:ObligationsAndRestrictions-section/docset:ObligationsAndRestrictions', 'documentId': '43rj0ds7s0ur', 'documentName': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'ObligationsAndRestrictions'}),\n",
" Document(page_content='3. Exceptions. The obligations and restrictions in Section 2 will not apply to any information or materials that:', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:Exceptions/docset:Exceptions-section/docset:Exceptions[2]', 'documentId': '43rj0ds7s0ur', 'documentName': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'Exceptions'}),\n",
" Document(page_content='(i) were, at the date of disclosure, or have subsequently become, generally known or available to the public through no act or failure to act by the receiving party;', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:TheDate/docset:TheDate/docset:TheDate', 'documentId': '43rj0ds7s0ur', 'documentName': 'NDA simple layout.docx', 'structure': 'p', 'tag': 'TheDate'}),\n",
" Document(page_content='(ii) were rightfully known by the receiving party prior to receiving such information or materials from the disclosing party;', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:TheDate/docset:SuchInformation/docset:TheReceivingParty', 'documentId': '43rj0ds7s0ur', 'documentName': 'NDA simple layout.docx', 'structure': 'p', 'tag': 'TheReceivingParty'}),\n",
" Document(page_content='(iii) are rightfully acquired by the receiving party from a third party who has the right to disclose such information or materials without breach of any confidentiality obligation to the disclosing party;', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:TheDate/docset:TheReceivingParty/docset:TheReceivingParty', 'documentId': '43rj0ds7s0ur', 'documentName': 'NDA simple layout.docx', 'structure': 'p', 'tag': 'TheReceivingParty'}),\n",
" Document(page_content='4. Compelled Disclosure . Nothing in this Agreement will be deemed to restrict a party from disclosing the other partys Confidential Information to the extent required by any order, subpoena, law, statute or regulation; provided, that the party required to make such a disclosure uses reasonable efforts to give the other party reasonable advance notice of such required disclosure in order to enable the other party to prevent or limit such disclosure.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:Disclosure/docset:CompelledDisclosure-section/docset:CompelledDisclosure', 'documentId': '43rj0ds7s0ur', 'documentName': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'CompelledDisclosure'}),\n",
" Document(page_content='5. Return of Confidential Information . Upon the completion or abandonment of the Purpose, and in any event upon the disclosing partys request, the receiving party will promptly return to the disclosing party all tangible items and embodiments containing or consisting of the disclosing partys Confidential Information and all copies thereof (including electronic copies), and any notes, analyses, compilations, studies, interpretations, memoranda or other documents (regardless of the form thereof) prepared by or on behalf of the receiving party that contain or are based upon the disclosing partys Confidential Information .', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:TheCompletion/docset:ReturnofConfidentialInformation-section/docset:ReturnofConfidentialInformation', 'documentId': '43rj0ds7s0ur', 'documentName': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'ReturnofConfidentialInformation'}),\n",
" Document(page_content='6. No Obligations . Each party retains the right to determine whether to disclose any Confidential Information to the other party.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:NoObligations/docset:NoObligations-section/docset:NoObligations[2]', 'documentId': '43rj0ds7s0ur', 'documentName': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'NoObligations'}),\n",
" Document(page_content='7. No Warranty. ALL CONFIDENTIAL INFORMATION IS PROVIDED BY THE DISCLOSING PARTY “AS IS ”.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:NoWarranty/docset:NoWarranty-section/docset:NoWarranty[2]', 'documentId': '43rj0ds7s0ur', 'documentName': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'NoWarranty'}),\n",
" Document(page_content='8. Term. This Agreement will remain in effect for a period of seven ( 7 ) years from the date of last disclosure of Confidential Information by either party, at which time it will terminate.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:ThisAgreement/docset:Term-section/docset:Term', 'documentId': '43rj0ds7s0ur', 'documentName': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'Term'}),\n",
" Document(page_content='9. Equitable Relief . Each party acknowledges that the unauthorized use or disclosure of the disclosing partys Confidential Information may cause the disclosing party to incur irreparable harm and significant damages, the degree of which may be difficult to ascertain. Accordingly, each party agrees that the disclosing party will have the right to seek immediate equitable relief to enjoin any unauthorized use or disclosure of its Confidential Information , in addition to any other rights and remedies that it may have at law or otherwise.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:EquitableRelief/docset:EquitableRelief-section/docset:EquitableRelief[2]', 'documentId': '43rj0ds7s0ur', 'documentName': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'EquitableRelief'}),\n",
" Document(page_content='10. Non-compete. To the maximum extent permitted by applicable law, during the Term of this Agreement and for a period of one ( 1 ) year thereafter, Caleb Divine may not market software products or do business that directly or indirectly competes with Docugami software products .', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:TheMaximumExtent/docset:Non-compete-section/docset:Non-compete', 'documentId': '43rj0ds7s0ur', 'documentName': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'Non-compete'}),\n",
" Document(page_content='11. Miscellaneous. This Agreement will be governed and construed in accordance with the laws of the State of Washington , excluding its body of law controlling conflict of laws. This Agreement is the complete and exclusive understanding and agreement between the parties regarding the subject matter of this Agreement and supersedes all prior agreements, understandings and communications, oral or written, between the parties regarding the subject matter of this Agreement . If any provision of this Agreement is held invalid or unenforceable by a court of competent jurisdiction, that provision of this Agreement will be enforced to the maximum extent permissible and the other provisions of this Agreement will remain in full force and effect. Neither party may assign this Agreement , in whole or in part, by operation of law or otherwise, without the other partys prior written consent, and any attempted assignment without such consent will be void. This Agreement may be executed in counterparts, each of which will be deemed an original, but all of which together will constitute one and the same instrument.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:Accordance/docset:Miscellaneous-section/docset:Miscellaneous', 'documentId': '43rj0ds7s0ur', 'documentName': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'Miscellaneous'}),\n",
" Document(page_content='[SIGNATURE PAGE FOLLOWS] IN WITNESS WHEREOF, the parties hereto have executed this Mutual Non-Disclosure Agreement by their duly authorized officers or representatives as of the date first set forth above.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:Witness/docset:TheParties/docset:TheParties', 'documentId': '43rj0ds7s0ur', 'documentName': 'NDA simple layout.docx', 'structure': 'p', 'tag': 'TheParties'}),\n",
" Document(page_content='DOCUGAMI INC . : \\n\\n Caleb Divine : \\n\\n Signature: Signature: Name: \\n\\n Jean Paoli Name: Title: \\n\\n CEO Title:', metadata={'xpath': '/docset:MutualNon-disclosure/docset:Witness/docset:TheParties/docset:DocugamiInc/docset:DocugamiInc/xhtml:table', 'documentId': '43rj0ds7s0ur', 'documentName': 'NDA simple layout.docx', 'structure': '', 'tag': 'table'})]"
]
},
"execution_count": 18,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@@ -110,16 +98,15 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"The `metadata` for each `Document` (really, a chunk of an actual PDF, DOC or DOCX) contains some useful additional information:\n",
"\n",
"1. documentId: ID of the actual PDF, DOC or DOCX the chunk is sourced from within Docugami.\n",
"2. xpath: XPath inside the XML representation of the document, for the chunk. Useful for source citations directly to the actual chunk inside the document XML.\n",
"3. structure: Structural attributes of the chunk, e.g. h1, h2, div, table, td, etc. Useful to filter out certain kinds of chunks if needed by the caller.\n",
"4. tag: Semantic tag for the chunk, using various generative and extractive techniques. More details here: https://github.com/docugami/DFM-benchmarks"
"1. **documentId and documentName:** ID and Name of the file (PDF, DOC or DOCX) the chunk is sourced from within Docugami.\n",
"2. **xpath:** XPath inside the XML representation of the document, for the chunk. Useful for source citations directly to the actual chunk inside the document XML.\n",
"3. **structure:** Structural attributes of the chunk, e.g. h1, h2, div, table, td, etc. Useful to filter out certain kinds of chunks if needed by the caller.\n",
"4. **tag:** Semantic tag for the chunk, using various generative and extractive techniques. More details here: https://github.com/docugami/DFM-benchmarks"
]
},
{
@@ -143,7 +130,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
@@ -155,30 +142,35 @@
"\n",
"# In this example, we already have a docset with a set of lease documents\n",
"loader = DocugamiLoader(docset_id=\"wh2kned25uqm\")\n",
"documents = loader.load()\n",
"documents[:5] # dump a few docs to output"
"documents = loader.load()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"The documents returned by the loader are already split, so we don't need to use a text splitter. Optionally, we can use the metadata on each document, for example the structure or tag attributes, to do any post-processing we want.\n",
"\n",
"We will just use the output of the DocugamiLoader as-is and set up embeddings and a retrieval QA chain the usual way."
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using embedded DuckDB without persistence: data will be transient\n"
]
}
],
"source": [
"# The documents returned by the loader are already split, but we can filter out\n",
"# some chunks e.g. tables in this example\n",
"texts = [Document(page_content=d.page_content) for d in documents if d.metadata[\"structure\"] not in [\"table\"]]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Set up embeddings and retrieval QA the usual way\n",
"embedding = OpenAIEmbeddings()\n",
"vectordb = Chroma.from_documents(documents=texts, embedding=embedding)\n",
"vectordb = Chroma.from_documents(documents=documents, embedding=embedding)\n",
"retriever = vectordb.as_retriever()\n",
"qa_chain = RetrievalQA.from_chain_type(llm=OpenAI(), \n",
" chain_type=\"stuff\", \n",
@@ -186,28 +178,30 @@
" return_source_documents=True)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"{'query': 'What can tenants do with signage on the properties?',\n",
" 'result': ' Tenants can place or attach signs (digital or otherwise) to the Premises after receiving written permission from the Landlord, which permission shall not be unreasonably withheld. Any signs or other form of identification allowed must conform to all applicable laws, ordinances, etc. governing the same. Tenant also agrees to have any window or glass identification completely removed and cleaned at its expense promptly upon vacating the Premises.',\n",
" 'source_documents': [Document(page_content='Signage. Tenant may place or attach to the Premises signs (digital or otherwise) or other such identification as needed after receiving written permission from the Landlord , which permission shall not be unreasonably withheld. Any damage caused to the Premises by the Tenant s erecting or removing such signs shall be repaired promptly by the Tenant at the Tenant s expense . Any signs or other form of identification allowed must conform to all applicable laws, ordinances, etc. governing the same. Tenant also agrees to have any window or glass identification completely removed and cleaned at its expense promptly upon vacating the Premises. \\n\\n ARTICLE VII UTILITIES 7.01', metadata={'xpath': '/docset:OFFICELEASEAGREEMENT-section/docset:OFFICELEASEAGREEMENT/docset:ThisOFFICELEASEAGREEMENTThis/docset:ArticleIBasic/docset:ArticleIiiUseAndCareOf/docset:ARTICLEIIIUSEANDCAREOFPREMISES-section/docset:ARTICLEIIIUSEANDCAREOFPREMISES/docset:NoOtherPurposes/docset:TenantsResponsibility/dg:chunk', 'documentId': 'g2fvhekmltza', 'documentName': 'TruTone Lane 6.pdf', 'structure': 'lim', 'tag': 'chunk'}),\n",
" Document(page_content='ARTICLE VI SIGNAGE 6.01 Signage . Tenant may place or attach to the Premises signs (digital or otherwise) or other such identification as needed after receiving written permission from the Landlord , which permission shall not be unreasonably withheld. Any damage caused to the Premises by the Tenant s erecting or removing such signs shall be repaired promptly by the Tenant at the Tenant s expense . Any signs or other form of identification allowed must conform to all applicable laws, ordinances, etc. governing the same. Tenant also agrees to have any window or glass identification completely removed and cleaned at its expense promptly upon vacating the Premises.', metadata={'xpath': '/docset:OFFICELEASEAGREEMENT-section/docset:OFFICELEASEAGREEMENT/docset:Article/docset:ARTICLEVISIGNAGE-section/docset:_601Signage-section/docset:_601Signage', 'documentId': 'v1bvgaozfkak', 'documentName': 'TruTone Lane 2.docx', 'structure': 'div', 'tag': '_601Signage'}),\n",
" Document(page_content='Landlord , its agents, servants, employees, licensees, invitees, and contractors during the last year of the term of this Lease at any and all times during regular business hours, after 24 hour notice to tenant, to pass and repass on and through the Premises, or such portion thereof as may be necessary, in order that they or any of them may gain access to the Premises for the purpose of showing the Premises to potential new tenants or real estate brokers. In addition, Landlord shall be entitled to place a \"FOR RENT \" or \"FOR LEASE\" sign (not exceeding 8.5 ” x 11 ”) in the front window of the Premises during the last six months of the term of this Lease .', metadata={'xpath': '/docset:Rider/docset:RIDERTOLEASE-section/docset:RIDERTOLEASE/docset:FixedRent/docset:TermYearPeriod/docset:Lease/docset:_42FLandlordSAccess-section/docset:_42FLandlordSAccess/docset:LandlordsRights/docset:Landlord', 'documentId': 'omvs4mysdk6b', 'documentName': 'TruTone Lane 1.docx', 'structure': 'p', 'tag': 'Landlord'}),\n",
" Document(page_content=\"24. SIGNS . No signage shall be placed by Tenant on any portion of the Project . However, Tenant shall be permitted to place a sign bearing its name in a location approved by Landlord near the entrance to the Premises (at Tenant's cost ) and will be furnished a single listing of its name in the Building's directory (at Landlord 's cost ), all in accordance with the criteria adopted from time to time by Landlord for the Project . Any changes or additional listings in the directory shall be furnished (subject to availability of space) for the then Building Standard charge .\", metadata={'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:GrossRentCreditTheRentCredit-section/docset:GrossRentCreditTheRentCredit/docset:ThisLease-section/docset:ThisLease/docset:Guaranty-section/docset:Guaranty[2]/docset:TheTransfer/docset:TheTerms/docset:Indemnification/docset:INDEMNIFICATION-section/docset:INDEMNIFICATION/docset:Waiver/docset:Waiver/docset:Signs/docset:SIGNS-section/docset:SIGNS', 'documentId': 'dsyfhh4vpeyf', 'documentName': 'Shorebucks LLC_CO.pdf', 'structure': 'div', 'tag': 'SIGNS'})]}"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Try out the retriever with an example query\n",
"qa_chain(\"Are tenants allowed to place signs on their property?\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Notice in the results (specifically the source_documents) that some tenants are actually allowed to place signs, while others are not. The LLM used all that context and decided to answer in the negative even though that is not completely correct."
"qa_chain(\"What can tenants do with signage on the properties?\")\n"
]
},
{
@@ -219,16 +213,28 @@
"\n",
"One issue with large documents is that the correct answer to your question may depend on chunks that are far apart in the document. Typical chunking techniques, even with overlap, will struggle with providing the LLM sufficent context to answer such questions. With upcoming very large context LLMs, it may be possible to stuff a lot of tokens, perhaps even entire documents, inside the context but this will still hit limits at some point with very long documents, or a lot of documents.\n",
"\n",
"For example, if we ask a more complex question that requires the LLM to draw on chunks from different parts of the document, even OpenAI's powerful LLM is unable to answer correctly. In the following example, the chunking of the document does not end up putting the Landlord name and the address in the same context, and instead ends up finding a few other unrelated chunks from other documents not even related to the Menlo Group landlord."
"For example, if we ask a more complex question that requires the LLM to draw on chunks from different parts of the document, even OpenAI's powerful LLM is unable to answer correctly."
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 17,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"' 9,753 square feet.'"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"qa_chain(\"What is the address of the property leased out by Menlo Group?\")"
"chain_response = qa_chain(\"What is rentable area for for the property owned by DHA Group?\")\n",
"chain_response[\"result\"]"
]
},
{
@@ -236,27 +242,94 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Docugami offers a better approach. Chunks are annotated with additional metadata created using different techniques if a user has been [using Docugami](https://help.docugami.com/home/reports). More technical approaches will be added later.\n",
"\n",
"For example, we can build the texts again, but this time read the metadata on the documents returned by docugami, and put some simple key/value pairs on all the text chunks based on the metadata:"
"At first glance the answer may seem reasonable, but if you review the source documents carefully for this answer, you will see that the chunking of the document did not end up putting the Landlord name and the rentable area in the same context, since they are far apart in the document. The retriever therefore ends up finding unrelated chunks from other documents not even related to the Menlo Group landlord. That landlord happens to be mentioned on the first page of the file **Shorebucks LLC_NJ.pdf** file, and while one of the source documents used by the chain is indeed from that doc and does contain the correct answer (13,500), the other ones source chunks are from other docs, and the answer is therefore incorrect. The language model seems to have picked up the square footage from one of the other unrelated source chunks."
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 18,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='1.6 Rentable Area of the Premises. 16,159 square feet . This square footage figure includes an add-on factor for Common Areas in the Building and has been agreed upon by the parties as final and correct and is not subject to challenge or dispute by either party.', metadata={'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:MenloGroup/docset:MenloGroup/docset:Premises[2]/docset:RentableAreaofthePremises-section/docset:RentableAreaofthePremises', 'documentId': 'qkn9cyqsiuch', 'documentName': 'Shorebucks LLC_AZ.pdf', 'structure': 'div', 'tag': 'RentableAreaofthePremises'}),\n",
" Document(page_content='1.6 Rentable Area of the Premises. 13,500 square feet . This square footage figure includes an add-on factor for Common Areas in the Building and has been agreed upon by the parties as final and correct and is not subject to challenge or dispute by either party.', metadata={'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:DhaGroup/docset:DhaGroup/docset:Premises[2]/docset:RentableAreaofthePremises-section/docset:RentableAreaofthePremises', 'documentId': 'md8rieecquyv', 'documentName': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'RentableAreaofthePremises'}),\n",
" Document(page_content='1.6 Rentable Area of the Premises. 9,753 square feet . This square footage figure includes an add-on factor for Common Areas in the Building and has been agreed upon by the parties as final and correct and is not subject to challenge or dispute by either party.', metadata={'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:PerryBlair/docset:PerryBlair/docset:Premises[2]/docset:RentableAreaofthePremises-section/docset:RentableAreaofthePremises', 'documentId': 'dsyfhh4vpeyf', 'documentName': 'Shorebucks LLC_CO.pdf', 'structure': 'div', 'tag': 'RentableAreaofthePremises'}),\n",
" Document(page_content='$ 9,998.00 25.5 % ( Premises Footage divided by total Building Rentable Area )', metadata={'xpath': '/docset:OFFICELEASEAGREEMENT-section/docset:OFFICELEASEAGREEMENT/docset:Article/docset:ARTICLEIBASICLEASETERMS-section/docset:ARTICLEIBASICLEASETERMS/docset:SecurityDepositTenantsShareApprovedUse-section/docset:SecurityDepositTenantsShareApprovedUse/docset:PremisesFootage[2]/docset:PremisesFootage', 'documentId': 'v1bvgaozfkak', 'documentName': 'TruTone Lane 2.docx', 'structure': 'p', 'tag': 'PremisesFootage'})]"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"texts = []\n",
"for d in documents:\n",
" if d.metadata[\"structure\"] not in [\"table\"]: # for parity with the example above, continue to filter out tables\n",
" metadata = {\"tag\": d.metadata[\"tag\"]}\n",
" for project in d.metadata[\"projects\"]:\n",
" for entry in project[\"entries\"]:\n",
" metadata[entry[\"heading\"]] = entry[\"value\"]\n",
" texts.append(Document(page_content=d.page_content, metadata=metadata))\n",
"chain_response[\"source_documents\"]"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Docugami can help here. Chunks are annotated with additional metadata created using different techniques if a user has been [using Docugami](https://help.docugami.com/home/reports). More technical approaches will be added later.\n",
"\n",
"vectordb = Chroma.from_documents(documents=texts, embedding=embedding)\n",
"For example, we can build the texts again after using Docugami for a while, and this time some additional metadata is returned on the documents returned by docugami, in the form of some simple key/value pairs on all the text chunks:"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'xpath': '/docset:OFFICELEASEAGREEMENT-section/docset:OFFICELEASEAGREEMENT/docset:ThisOfficeLeaseAgreement',\n",
" 'documentId': 'v1bvgaozfkak',\n",
" 'documentName': 'TruTone Lane 2.docx',\n",
" 'structure': 'p',\n",
" 'tag': 'ThisOfficeLeaseAgreement',\n",
" 'Landlord': 'BUBBA CENTER PARTNERSHIP',\n",
" 'Tenant': 'Truetone Lane LLC',\n",
" 'Rentable Area': '17,044 square feet'}"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"loader = DocugamiLoader(docset_id=\"wh2kned25uqm\")\n",
"documents = loader.load()\n",
"documents[0].metadata"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"We can rebuild our retrieval QA chain with these richer chunks, and this time just pick the top result from the retriever which we expect to be more accurate since it includes the vector metadata."
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using embedded DuckDB without persistence: data will be transient\n"
]
}
],
"source": [
"vectordb = Chroma.from_documents(documents=documents, embedding=embedding)\n",
"retriever = vectordb.as_retriever()\n",
"qa_chain = RetrievalQA.from_chain_type(llm=OpenAI(), \n",
" chain_type=\"stuff\", \n",
@@ -269,16 +342,33 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's run the same question again. It returns better results since all the chunks have metadata key/value pairs on them carrying key information about the document even if it is physically very far away."
"Let's run the same question again. It returns the correct result since all the chunks have metadata key/value pairs on them carrying key information about the document even if this infromation is physically very far away from the source chunk used to generate the answer."
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 38,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"{'query': \"What is the rentable area of Perry & Blair's property?\",\n",
" 'result': ' 16,159 square feet.',\n",
" 'source_documents': [Document(page_content='1.1 Landlord . Perry & Blair LLC', metadata={'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:PerryBlair/docset:PerryBlair/docset:PerryBlair/docset:Landlord-section/docset:Landlord', 'documentId': 'dsyfhh4vpeyf', 'documentName': 'Shorebucks LLC_CO.pdf', 'structure': 'div', 'tag': 'Landlord', 'Landlord': 'Perry & Blair LLC', 'Tenant': 'Shorebucks LLC', 'Rentable Area': '9,753 square feet'}),\n",
" Document(page_content=\"1.16 Landlord 's Notice Address . Perry & Blair LLC , Suite 210 , 515 Michelin Rd , Denver , CO , 80209 .\", metadata={'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:GrossRentCreditTheRentCredit-section/docset:GrossRentCreditTheRentCredit/docset:First/docset:ApplicableSalesTax/docset:PercentageRent/docset:NoticeAddress[2]/docset:LandlordsNoticeAddress-section/docset:LandlordsNoticeAddress[2]', 'documentId': 'dsyfhh4vpeyf', 'documentName': 'Shorebucks LLC_CO.pdf', 'structure': 'div', 'tag': 'LandlordsNoticeAddress', 'Landlord': 'Perry & Blair LLC', 'Tenant': 'Shorebucks LLC', 'Rentable Area': '9,753 square feet'}),\n",
" Document(page_content='1.6 Rentable Area of the Premises. 13,500 square feet . This square footage figure includes an add-on factor for Common Areas in the Building and has been agreed upon by the parties as final and correct and is not subject to challenge or dispute by either party.', metadata={'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:DhaGroup/docset:DhaGroup/docset:Premises[2]/docset:RentableAreaofthePremises-section/docset:RentableAreaofthePremises', 'documentId': 'md8rieecquyv', 'documentName': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'RentableAreaofthePremises', 'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC', 'Rentable Area': '13,500 square feet'}),\n",
" Document(page_content='1.6 Rentable Area of the Premises. 16,159 square feet . This square footage figure includes an add-on factor for Common Areas in the Building and has been agreed upon by the parties as final and correct and is not subject to challenge or dispute by either party.', metadata={'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:MenloGroup/docset:MenloGroup/docset:Premises[2]/docset:RentableAreaofthePremises-section/docset:RentableAreaofthePremises', 'documentId': 'qkn9cyqsiuch', 'documentName': 'Shorebucks LLC_AZ.pdf', 'structure': 'div', 'tag': 'RentableAreaofthePremises', 'Landlord': 'Menlo Group', 'Tenant': 'Shorebucks LLC', 'Rentable Area': '16,159 square feet'})]}"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"qa_chain(\"How much prepaid rent does Menlo Group require at signing?\")"
"chain_response = qa_chain(\"What is the rentable area of Perry & Blair's property?\")\n",
"chain_response"
]
},
{

View File

@@ -17,7 +17,8 @@ TD_NAME = "{http://www.w3.org/1999/xhtml}td"
TABLE_NAME = "{http://www.w3.org/1999/xhtml}table"
XPATH_KEY = "xpath"
DOCUMENT_ID_KEY = "documentId"
DOCUMENT_ID_KEY = "id"
DOCUMENT_NAME_KEY = "name"
STRUCTURE_KEY = "structure"
TAG_KEY = "tag"
PROJECTS_KEY = "projects"
@@ -39,8 +40,7 @@ class DocugamiLoader(BaseLoader, BaseModel):
docset_id: Optional[str]
document_ids: Optional[List[str]]
file_paths: Optional[List[Path]]
max_chunk_size: int = 1024
min_chunk_size: int = 10
min_chunk_size: int = 32 # appended to the next chunk to avoid over-chunking
@root_validator
def validate_local_or_remote(cls, values: Dict[str, Any]) -> Dict[str, Any]:
@@ -56,9 +56,18 @@ class DocugamiLoader(BaseLoader, BaseModel):
return values
def _xpath_for_chunk(self, chunk: Any) -> str:
ancestor_chain = chunk.xpath("ancestor-or-self::*")
def _parse_dgml(
self, document: Dict, content: bytes, doc_metadata: Optional[Dict] = None
) -> List[Document]:
try:
from lxml import etree
except ImportError:
raise ValueError(
"Could not import lxml python package. "
"Please install it with `pip install lxml`."
)
# helpers
def _xpath_qname_for_chunk(chunk: Any) -> str:
qname = f"{chunk.prefix}:{chunk.tag.split('}')[-1]}"
@@ -71,30 +80,38 @@ class DocugamiLoader(BaseLoader, BaseModel):
return qname
return "/" + "/".join(_xpath_qname_for_chunk(x) for x in ancestor_chain)
def _xpath_for_chunk(chunk: Any) -> str:
ancestor_chain = chunk.xpath("ancestor-or-self::*")
return "/" + "/".join(_xpath_qname_for_chunk(x) for x in ancestor_chain)
def _parse_dgml(
self, document_id: str, content: bytes, project_metadata: Optional[List] = None
) -> List[Document]:
try:
from lxml import etree
except ImportError:
raise ValueError(
"Could not import lxml python package. "
"Please install it with `pip install lxml`."
def _structure_value(node: Any) -> str:
structure = (
"table"
if node.tag == TABLE_NAME
else node.attrib["structure"]
if "structure" in node.attrib
else None
)
return structure
# helper functions
def _is_structural(node):
return "structure" in node.attrib
def _is_structural(node: Any) -> bool:
return _structure_value(node) is not None
def _get_text(node):
def _is_heading(node: Any) -> bool:
structure = _structure_value(node)
return structure is not None and structure.lower().startswith("h")
def _get_text(node: Any) -> str:
return " ".join(node.itertext()).strip()
def _leaf_structural_nodes(node):
if _is_structural(node) and not any(
_is_structural(child) for child in node
):
def _has_structural_descendant(node: Any) -> bool:
for child in node:
if _is_structural(child) or _has_structural_descendant(child):
return True
return False
def _leaf_structural_nodes(node: Any) -> List:
if _is_structural(node) and not _has_structural_descendant(node):
return [node]
else:
leaf_nodes = []
@@ -102,47 +119,49 @@ class DocugamiLoader(BaseLoader, BaseModel):
leaf_nodes.extend(_leaf_structural_nodes(child))
return leaf_nodes
# combine metadata across projects for this document
doc_metadata = []
if project_metadata:
for metadata in project_metadata:
project_doc_metadata = metadata.get(document_id)
if project_doc_metadata:
doc_metadata.append(project_doc_metadata)
def _create_doc(node: Any, text: str) -> Document:
metadata = {
XPATH_KEY: _xpath_for_chunk(node),
DOCUMENT_ID_KEY: document["id"],
DOCUMENT_NAME_KEY: document["name"],
STRUCTURE_KEY: node.attrib.get("structure", ""),
TAG_KEY: re.sub(r"\{.*\}", "", node.tag),
}
if doc_metadata:
metadata.update(doc_metadata)
return Document(
page_content=text,
metadata=metadata,
)
# parse the tree and return chunks
tree = etree.parse(io.BytesIO(content))
root = tree.getroot()
chunks: List[Document] = []
prev_small_chunk_text = None
for node in _leaf_structural_nodes(root):
text = _get_text(node)
if (
len(text) < self.min_chunk_size
and len(chunks) > 0
and len(chunks[-1].page_content + text) < self.max_chunk_size
):
# node is leaf structural, but too small, so append to previous chunk
# as long as this will not push previous chunk itself past max size.
chunks[-1].page_content += " " + text
if prev_small_chunk_text:
text = prev_small_chunk_text + " " + text
prev_small_chunk_text = None
if _is_heading(node) or len(text) < self.min_chunk_size:
# Save headings or other small chunks to be appended to the next chunk
prev_small_chunk_text = text
else:
chunks.append(
Document(
page_content=text,
metadata={
XPATH_KEY: self._xpath_for_chunk(node),
DOCUMENT_ID_KEY: document_id,
STRUCTURE_KEY: node.attrib.get("structure", ""),
TAG_KEY: re.sub(r"\{.*\}", "", node.tag),
PROJECTS_KEY: doc_metadata,
},
)
)
chunks.append(_create_doc(node, text))
if prev_small_chunk_text and len(chunks) > 0:
# small chunk at the end left over, just append to last chunk
chunks[-1].page_content += " " + prev_small_chunk_text
return chunks
def _document_ids_for_docset_id(self, docset_id: str) -> List[str]:
"""Gets all document IDs for the given docset ID"""
def _document_details_for_docset_id(self, docset_id: str) -> List[Dict]:
"""Gets all document details for the given docset ID"""
url = f"{self.api}/docsets/{docset_id}/documents"
response = requests.request(
"GET",
@@ -152,7 +171,7 @@ class DocugamiLoader(BaseLoader, BaseModel):
)
if response.ok:
# TODO: pagination
return [doc["id"] for doc in response.json()["documents"]]
return response.json()["documents"]
else:
raise Exception(
f"Failed to download {url} (status: {response.status_code})"
@@ -176,10 +195,8 @@ class DocugamiLoader(BaseLoader, BaseModel):
)
def _metadata_for_project(self, project: Dict) -> Dict:
"""Gets project metadata"""
"""Gets project metadata for all files"""
project_id = project.get("id")
project_name = project.get("name")
project_type = project.get("type")
per_file_metadata = {}
url = f"{self.api}/projects/{project_id}/artifacts/latest"
@@ -203,11 +220,7 @@ class DocugamiLoader(BaseLoader, BaseModel):
and artifact_doc
):
doc_id = artifact_doc["id"]
metadata: Dict = {
"projectType": project_type,
"projectTitle": project_name,
"entries": [],
}
metadata: Dict = {}
# the evaluated XML for each document is named after the project
response = requests.request(
@@ -231,17 +244,10 @@ class DocugamiLoader(BaseLoader, BaseModel):
entries = artifact_root.xpath("//wp:Entry", namespaces=ns)
for entry in entries:
heading = entry.xpath("./wp:Heading", namespaces=ns)[0].text
xpath = entry.xpath("./wp:XPath", namespaces=ns)[0].text
value = " ".join(
entry.xpath("./wp:Value", namespaces=ns)[0].itertext()
).strip()
metadata["entries"].append( # pylint: disable
{
"heading": heading,
"xpath": xpath,
"value": value,
}
)
metadata[heading] = value
per_file_metadata[doc_id] = metadata
else:
raise Exception(
@@ -255,9 +261,10 @@ class DocugamiLoader(BaseLoader, BaseModel):
f"Failed to download {url} (status: {response.status_code})"
)
def _load_chunks_for_document_id(
self, docset_id: str, document_id: str, project_metadata: Optional[List] = None
def _load_chunks_for_document(
self, docset_id: str, document: Dict, doc_metadata: Optional[Dict] = None
) -> List[Document]:
document_id = document["id"]
url = f"{self.api}/docsets/{docset_id}/documents/{document_id}/dgml"
response = requests.request(
@@ -268,7 +275,7 @@ class DocugamiLoader(BaseLoader, BaseModel):
)
if response.ok:
return self._parse_dgml(document_id, response.content, project_metadata)
return self._parse_dgml(document, response.content, doc_metadata)
else:
raise Exception(
f"Failed to download {url} (status: {response.status_code})"
@@ -280,27 +287,35 @@ class DocugamiLoader(BaseLoader, BaseModel):
if self.access_token and self.docset_id:
# remote mode
_document_ids = self.document_ids
if not _document_ids:
# no document IDs specified, default to all docs in docset
_document_ids = self._document_ids_for_docset_id(self.docset_id)
_document_details = self._document_details_for_docset_id(self.docset_id)
if self.document_ids:
_document_details = [
d for d in _document_details if d["id"] in self.document_ids
]
_project_details = self._project_details_for_docset_id(self.docset_id)
project_metadatas = []
combined_project_metadata = {}
if _project_details:
# if there are any projects for this docset, load project metadata
for project in _project_details:
metadata = self._metadata_for_project(project)
project_metadatas.append(metadata)
combined_project_metadata.update(metadata)
for doc_id in _document_ids:
chunks += self._load_chunks_for_document_id(
self.docset_id, doc_id, project_metadatas
for doc in _document_details:
doc_metadata = combined_project_metadata.get(doc["id"])
chunks += self._load_chunks_for_document(
self.docset_id, doc, doc_metadata
)
elif self.file_paths:
# local mode (for integration testing, or pre-downloaded XML)
for path in self.file_paths:
with open(path, "rb") as file:
chunks += self._parse_dgml(path.name, file.read())
chunks += self._parse_dgml(
{
DOCUMENT_ID_KEY: path.name,
DOCUMENT_NAME_KEY: path.name,
},
file.read(),
)
return chunks

View File

@@ -9,13 +9,13 @@ def test_docugami_loader_local() -> None:
loader = DocugamiLoader(file_paths=[file_path])
docs = loader.load()
assert len(docs) == 45
assert len(docs) == 19
xpath = docs[0].metadata.get("xpath")
assert str(xpath).endswith("/docset:MutualNon-disclosure")
assert docs[0].metadata["structure"] == "h1"
assert docs[0].metadata["tag"] == "MutualNon-disclosure"
assert docs[0].page_content == "MUTUAL NON-DISCLOSURE AGREEMENT"
assert str(xpath).endswith("/docset:Preamble")
assert docs[0].metadata["structure"] == "p"
assert docs[0].metadata["tag"] == "Preamble"
assert docs[0].page_content.startswith("MUTUAL NON-DISCLOSURE AGREEMENT")
def test_docugami_loader_remote_init() -> None: