From 79ae6c2a9ebb815e56da1410bf621aecc3ee51a4 Mon Sep 17 00:00:00 2001 From: William FH <13333726+hinthornw@users.noreply.github.com> Date: Thu, 14 Dec 2023 09:21:45 -0800 Subject: [PATCH] Add dense proposals (#14719) Indexing strategy based on decomposing candidate propositions while indexing. --- .../rag-chroma-dense-retrieval/.gitignore | 3 + templates/rag-chroma-dense-retrieval/LICENSE | 21 + .../rag-chroma-dense-retrieval/README.md | 81 + .../_images/retriever_diagram.png | Bin 0 -> 383829 bytes .../rag-chroma-dense-retrieval/poetry.lock | 2859 +++++++++++++++++ .../rag-chroma-dense-retrieval/pyproject.toml | 35 + .../rag_chroma_dense_retrieval.ipynb | 68 + .../rag_chroma_dense_retrieval/__init__.py | 4 + .../rag_chroma_dense_retrieval/chain.py | 67 + .../rag_chroma_dense_retrieval/constants.py | 1 + .../rag_chroma_dense_retrieval/ingest.py | 87 + .../proposal_chain.py | 107 + .../rag_chroma_dense_retrieval/storage.py | 38 + .../tests/__init__.py | 0 14 files changed, 3371 insertions(+) create mode 100644 templates/rag-chroma-dense-retrieval/.gitignore create mode 100644 templates/rag-chroma-dense-retrieval/LICENSE create mode 100644 templates/rag-chroma-dense-retrieval/README.md create mode 100644 templates/rag-chroma-dense-retrieval/_images/retriever_diagram.png create mode 100644 templates/rag-chroma-dense-retrieval/poetry.lock create mode 100644 templates/rag-chroma-dense-retrieval/pyproject.toml create mode 100644 templates/rag-chroma-dense-retrieval/rag_chroma_dense_retrieval.ipynb create mode 100644 templates/rag-chroma-dense-retrieval/rag_chroma_dense_retrieval/__init__.py create mode 100644 templates/rag-chroma-dense-retrieval/rag_chroma_dense_retrieval/chain.py create mode 100644 templates/rag-chroma-dense-retrieval/rag_chroma_dense_retrieval/constants.py create mode 100644 templates/rag-chroma-dense-retrieval/rag_chroma_dense_retrieval/ingest.py create mode 100644 templates/rag-chroma-dense-retrieval/rag_chroma_dense_retrieval/proposal_chain.py 
create mode 100644 templates/rag-chroma-dense-retrieval/rag_chroma_dense_retrieval/storage.py create mode 100644 templates/rag-chroma-dense-retrieval/tests/__init__.py diff --git a/templates/rag-chroma-dense-retrieval/.gitignore b/templates/rag-chroma-dense-retrieval/.gitignore new file mode 100644 index 00000000000..86997822350 --- /dev/null +++ b/templates/rag-chroma-dense-retrieval/.gitignore @@ -0,0 +1,3 @@ +docs/img_*.jpg +chroma_db_proposals +multi_vector_retriever_metadata diff --git a/templates/rag-chroma-dense-retrieval/LICENSE b/templates/rag-chroma-dense-retrieval/LICENSE new file mode 100644 index 00000000000..426b6509034 --- /dev/null +++ b/templates/rag-chroma-dense-retrieval/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 LangChain, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/templates/rag-chroma-dense-retrieval/README.md b/templates/rag-chroma-dense-retrieval/README.md new file mode 100644 index 00000000000..986d7d782b3 --- /dev/null +++ b/templates/rag-chroma-dense-retrieval/README.md @@ -0,0 +1,81 @@ +# rag-chroma-dense-retrieval + +This template demonstrates the multi-vector indexing strategy proposed in Chen et al.'s [Dense X Retrieval: What Retrieval Granularity Should We Use?](https://arxiv.org/abs/2312.06648). The prompt, which you can [try out on the hub](https://smith.langchain.com/hub/wfh/proposal-indexing), directs an LLM to generate de-contextualized "propositions" which can be vectorized to increase the retrieval accuracy. You can see the full definition in `proposal_chain.py`. + +![Retriever Diagram](./_images/retriever_diagram.png) + +## Storage + +For this demo, we index a simple academic paper using the RecursiveUrlLoader, and store all retriever information locally (using chroma and a bytestore stored on the local filesystem). You can modify the storage layer in `storage.py`. + +## Environment Setup + +Set the `OPENAI_API_KEY` environment variable to access `gpt-3.5` and the OpenAI Embeddings classes. + +## Indexing + +Create the index by running the following: + +```shell +poetry install +poetry run python rag_chroma_dense_retrieval/ingest.py +``` + +## Usage + +To use this package, you should first have the LangChain CLI installed: + +```shell +pip install -U langchain-cli +``` + +To create a new LangChain project and install this as the only package, you can do: + +```shell +langchain app new my-app --package rag-chroma-dense-retrieval +``` + +If you want to add this to an existing project, you can just run: + +```shell +langchain app add rag-chroma-dense-retrieval +``` + +And add the following code to your `server.py` file: + +```python +from rag_chroma_dense_retrieval import chain + +add_routes(app, chain, path="/rag-chroma-dense-retrieval") +``` + +(Optional) Let's now configure LangSmith. 
+LangSmith will help us trace, monitor and debug LangChain applications. +LangSmith is currently in private beta, you can sign up [here](https://smith.langchain.com/). +If you don't have access, you can skip this section + +```shell +export LANGCHAIN_TRACING_V2=true +export LANGCHAIN_API_KEY= +export LANGCHAIN_PROJECT= # if not specified, defaults to "default" +``` + +If you are inside this directory, then you can spin up a LangServe instance directly by: + +```shell +langchain serve +``` + +This will start the FastAPI app with a server is running locally at +[http://localhost:8000](http://localhost:8000) + +We can see all templates at [http://127.0.0.1:8000/docs](http://127.0.0.1:8000/docs) +We can access the playground at [http://127.0.0.1:8000/rag-chroma-dense-retrieval/playground](http://127.0.0.1:8000/rag-chroma-dense-retrieval/playground) + +We can access the template from code with: + +```python +from langserve.client import RemoteRunnable + +runnable = RemoteRunnable("http://localhost:8000/rag-chroma-dense-retrieval") +``` diff --git a/templates/rag-chroma-dense-retrieval/_images/retriever_diagram.png b/templates/rag-chroma-dense-retrieval/_images/retriever_diagram.png new file mode 100644 index 0000000000000000000000000000000000000000..a254a448778148cbdcc9057f3f062d0efa7470bd GIT binary patch literal 383829 zcmd43bzEG_)+pFWaCZsr?(Po3-Q7Jn!CixU2u^T!cL^3865PFmyG?V>$-VcznfczG z|EBrTyL#`cwW?OFTHYP0q#%U|hX)4$01#!Q#Z>?RXfXf)!Vv}vbf?lyLH1bWQB+yzzF!oso=GPJIZ7rp`t0&npfx9 zr>@W4DcamhY+>9EvAhmf@`!m|4-8OTVM-mI!Sd4T?6Wn>B`(u%pU=!KaG(wnIfk4G z0iR~~B10n_-RxKQeFH!HUCjuU;bOs;+kE|SR+**T9Zr7bpfesNpkDTfZ3FarO2!UP z6?|Fn5I3Z?$1OZfj_Peelm;_#8?y-VhzRFlkno}Ap4~$4^@?W7lryoS1BI64Yexaz|Mnvu10R;}zuz{4{`4yf20Zw?~b6ai%zFS5FlMj?>gz8;MX? 
zQ_3fYB!*nd}M ztx#IZ5uq(vNrP4_Kp-ymlJ}h_a)9KB)7oh4Y{}FU?MHJzaI~N1+VSI2%OHxIm}|*c z$jbw0L2VcSI2alL0@MNn{Q<(Q0g!*S0RT$S6|_l-VE|~*6%F*SlnegnDYRHF#Gf{V z<6B1&RWTVE&{fsc+1%X0#mdnY^RDI>001s%t)}IwB`?Qo>S)hsZ02ZU&gg0X@oflz z-;)>Av^RG(Cib+qb8zAH6d?Vp2QR4ocAJTm_^&RmwgRMD@=C;Fj?U)9oQ%wj%%p;F z#Kgq>&Sn<8D&ms=3Zy@xS2wk4OJ8^lc5i ziq6&`bjEK)6lCFN`oG=#r$0Z_8-@Ra;(u@FUw1+KDG0~U^e^5Jgaa6j^Z)?D02y%+ zHBYb;edu(I!G%t{01A59AKG>4E;o7Um>BOcQJ62r@3{oF*#sd>e&kZ$DPg#X=770q z#D68X#B%>NZdjc91$da1@jK%%1DGLjyS(inoE>VLE9zj3f8PyQdM^Pfbtjs%QMHB@L|Mg4b_eH)vY=L7YB+JS%L zkd}p%4l6QmtW)~$NgI!_%l3cX~W*jt(s*CT5-Q<4t@dm+gYw(L$vz za3u^8zYz4&Y2hGVX2fc^DKSIA*f?)z ze_v``@%iDJf{BUA`RUkmGTBqlLRH#rywcrY%ZJDG-t<8X+cLw1>UH1Vf$?uIor7P?kl*`dmK zPId*JlGo>K6?G?#`$bthPI2b?Zh8>EG`*)W`igAF@@mdJv}Ol0?6N=i+RxQ%-GR+G zEWK|16t_S9_uERWAlu#}X{6?1?)nM)nq>Dc>y=o)vRTh%#23(mam}6a(zde1R;AFYXGF3JevrbhF%&zMY91OPt(5Bb zz0aDx!#Yi!^+dCeoAsP2cby93iJ@ZQI%dzlq9tgUP}mk8*XIM>(fr(+BVFgb`=R%} zE_oye!4+bGyIEWDrhD6dmy+H7f>U zwJJoYB;#D;f=Mph)?7`q_YJU(i1r$yKqIhhVbR9Vx6JwWm8-q^c#>;+7an>~}8f?fxD102kS?FYNum)xvFY z6H@zPS0zFaxM)@05~||fc+gT=f6xe}d%EGNwaP%B7OV1l#%5gv-;5pdxF!+#+-1!; zB@ww=eTA}gm=S>EK1Yn9Aneg(h%in(5^lwvOMY6mo!yzFXr? 
z8JjSgE_Xc>a34jkal-4!_uoiHIvTZ}iUdnh<-ObE{pI{ z$jbt7*WnL#48=ZduwPzqI*WSZmd|dqUHrj9Ajh8jiY_x{2!cgQP%Ojb23Amo&yRSV!cUP|derH-K+%EDDe2GSL{nnd~x~^}x_ZZ|@tm+32dnQa4Ac#hpO9p#1T)G~1GZbkD=Yd?z zgQK@k=?l6P2A?Wc+z(t!GRN&E$Ikj7&p0Q=&hlf;I0eULx-w_G(qgai6rY0HQ;KdM zpn;vLexL>(Rl5u{FapGbzy?*nuP{V&X`l|4?rhnvqj9t8;P)clr^!eU@CrdUYcvE@ zTd21mlVUTQ)<9&yX`hIjtq*n$s7Hs5b^UsT0OFipr%CA`Csr&oakP#Cg>5db=)7se zk6v8yNJsRE+!NQEI!3`B^ohB0p8I65-0(&rK4gNN4O|XOH`0ei{XB_zWmqgI68jiT zQlccQRcJdv5m4XSwjivOGX5w;=Oun@&a`ltZDg_newF4qH@F}tx$)9K0^H)0}( zW>g2a%_)bX^G++2j76TG%gt!Z`7an};9!rpVBNKWa-w`;oJ1j6nXt}G7u&b&2rLKe z*2hHx&8DZfb!`5B6c{Nl|EvBE)QJqxG<{b;gG={0uk^DBod&0=3)^{S)kZL+ew(YcxCb#Y zCi6Lgb)Ze5%V8A37)a3yPmpee#k9;?-~UYVZc|))_y47YP|n^ zi^DASXKjD1hJoM!O|ugHWz=A8`jvrP%)R2=doBD%31tg<6E`%id$7Rw3c1!H6IO(w z(Ze6#EY|={m=^2yuX6CDOzBC^HrpMeE+E03QFAwJV?$V2f{uG+A^LNIpHO;UYs8An zCE$Cp>Vu!`aLiDArgN*zFyf2D3|N=~Hx(NJ*RHebslm5xD590IH*xQ_sb=VK22v3Y zXCTSIWuK;b*iy>Gb$_nQYp@A1TdS~c#dQs0x1k`1k3)RMuMV4BGn|_Whs2EIaeOj` z=ZUk8r9NG;c#sZ?yWpNF9_a;j4X2|Avn$=N{bZq@H z6bR_v&&Nh8K5X`TC(YQwQ8Z_9_FR&r6|XlmO?jK?O_5TdIhO zxdCbe(81OSYL&qn?=wHfXW-C>qYpek->&y(A%%Dx&5cZJsuMfo5sYVYDOOt`X_Ctk z_b`Ijms)zTeg&eI7#d%5F&1S<`^5>CzO19+ zl_c*!&V`=@z*pJsu1MmcDiG$~aOg|uX6cD}pJ#(&jv(r6tzi;l_iQ!-M{ue%GAu1R z!E^?jCEQn9QxwD*bOPD&XMRROIRa>btd&Y>DdF8PA7s6DMq>qLgRag+zZ{h%%Lf^o| z+rZ>@dXNH{_t`+k5HVuv-bAJ?n}^VY-Q2xwR8vwJu!D+vAz+euyWh^=S1wq366|@UWuma;UJg|geYUwrNR5-|N zoE5|agvJu2{a^Rd*}r8;BuP1t`;fEWhsObSPr(`N3wlQF}PMMRDbd{q$K z`zva9ZXIqkT{eAj*1~* zj-JFt9&|F(U;+RklH@~NIPh1%qEU%fE9eH+lxf~djz@^}T zq>PX8;Z|Z0_G+JX>)mZta zDPPMjp1X(I6EqnV1m2n0yyvnlk92W)X@0&n2u8<+Q*Hc>^)Chyh73aO;R0OhrGFiG zuS^}-z2mUj_HCd8d++z&ct*ik&|qNA9@H2#xro?2eg7ZkAV&t%`q;yT=Zj2@Yf`IT zZ!wXW76`dUQ~6z~B6+CH`JK8&?o?xg2g7VISY4Aaahzny10#2(GqkfwLW90m4voCa}RFw+oG7+wwtX{5h z$e^~SV$Uk1hKnQlrdFld!PA9;)Y->LM3x$?3!25((+<)9<)R%%12C>~G4Vd+62LJ` ztL=TA2|{=W#_REWC0?9*JjC^IX0W+@oY^TVWjVd!qgc?EaetJx9kGavSC|(E?q3Lq z!QYPC27cTNT-SYiN8jzb%vp?-5Y+mU+HX7Fl%##og3ge@8dz|!jY*&iUG{%;k%{-C 
z=2`8>uT#o@@q~s^{mV-)WC1@=?7O8*hG0G&d@;e(*lzxh`7%T3qz;V4P@o-NGQs4`P|p+YQn#KOONV?wYd|3pj9 zxWEsZbCClg=}-!$NMp$FZu-Ns*X0c)X(8VF#Ct+crGjL7^_MXeA=G^4)4REikC$Ic ztzDXK?VUY+cnq>DkvK!B$3^ewYw38)h(*K%#lN7rVt~rDi}r5eGIYSPHj%xlB-k!MbwXw)0-1N#dDj8jpSUG$D-9cEtEr_^;rE) z5zNMUbWWpde<<()3ooq@_gPRu8x#Q3BG#kmE7owG&L4Wsc4c2nJgj){5>9RUhcYFr zFh*TvAdauInkIdgZ_W?Ar=uST<)~LnYM4Uc$Ss(Chp2q_>mmwf5OOEz9ejgL&Gd?} z6w;1u9HBtNA+L)+mv>?FoBv4ZWQEV1Ds}$#NzCa&IEgMo4 z*uak=)JupJzpwjyFqeK4iyPx%n-X`!LA|IJE2QN3J;)^GS~yh*bZ2B_c-8sm_}(O_ zD)3MkHGU_I>;7xL9D#tnf*NX}X~7JTvMOv81)87aKV=MgJ6h%Wob^Iqj8Nqqgwh<) zg(qP#=t!Tgb?)A6$E5`$@k&ogs{WCeD1kpwm_w+s2sZIEM=zDg#TX`g;)GtP(Ww+L zP9|&&$E=6@{hl92rt-wT`M z@SsUHI>%@be-^e286|q_P{IS5YmW4qcf%rIu2ESCQY9Q|EMr*-T-~w0g0D|58SK`H zPv^r#sqfo`R&1*~&3@Jx*RA_KwsuFb{Iy_MVZihELB8yZV0=ihU18;Wt6BiRr5yY>&#_&&Ii!lGUdAG@=#20{^oNOFiXfMes`B2 zzUl~xd2sL*)ZTZS;k%u;YvrPpv1Aqe?Kexd9ICOgR2mz<B7A&n;E&iO_k2^BSt5Ie`K+ zyAQ|J*z_8Ce@+!Kcq4kCj$)W7{8cR9?|8T4g-;KVJ+7-Mng*rv2l?oh*sf5uq{vIT zekf+OqV~HsVh+ukCqK#FcgwffpN4m^A^$Dagdt5N039W3b3(_0{WtBg7w|cyeXiW3 zl-kUZDLFYrDzX@Kz&w(pG(m~PyiFizzas*A>deQ?K!>Zy752`}~XR+Y( zm8yJ*DA~GfpkWriV~6t1y1zeDlDW=bn4};##Fo~IdOHYwd_U0~6fo+wFjLF;I0|fK*5IAbEHVavluGUb zF+db!zzXZT@@^0{zPCgwrA|!?T$20&awyuJNtAgXjhYSu$S5EdKnd{2)+#YxMrjq? 
z#J=i!(M{KPUgDgVgw86AAY+MCcf05C{@9zunX%3 zpyVUsb0$|*RHWA<%}U|Khh6q#y7V={``_{QAxNCRXCH~c7fhL8yg)63hE-rB~tM*CHxMDecbF1r_zHM$%fWK@Oru5 z=dd;s8eV_nsz7n#0N4)&dPXx+hNvz!2Jn>PrcSUNeovmC_A)Fxe#Dx3Z4bpHb>1wE zu@I6Ka)K|MtF&y-F&aBb06HD1r<8cx{eY9W_BlsuT+7x>wTD ztG9?DeB6BOzWl-W==C)T&5W9z=mjMIXbb02H8QRbXQ_@-hM2nU^7*glR7I>Qj-lT- z8oEqyz_t^}OofPyh5W^opotEI0(?F%C<18kVo@ZhFuq9D%1I8YD;davVlkCcfN1uU zF9KF^yzVClbaoqLSJ#V%)d~DrEM0-#y76jIy$tYKzh6L5puleIB zcJw+(evIGEJ~f@LZdo4nwpp&WN-+8Gs}e2XP>z?k)fd?dab8;^?>jo8FuNVf+x#f) zsSwc9(fKA8j^ssB!7;8lOd}HJ^^P8Dk1EHfjQqPj-marP~y; zsoz1W%f_w`9k1T8<6Lr1Bi06vX)O)3*>h4xz@{YT2FLUF-VhT9mQ^skl^^Ndd0C$4k*yl7Lp-dSj%;_qw-~D_^|77K02?!Jz@i zcO!jN4?OHVyqgM?Y=J>obN?LWS;>`h|aO|T4P+iM7`YQU@C83 z?z7e<_90GrT;~kZAQxVjaP7GZ0jyg#NUZ;r(b_uwvDX&m_RZk+^TZXS4>SA zz+7s6%9LjcCz|iHJtT7o^9ZpyH}dYm>7aWvBOu3vUIqF6_IrijD83Qgl)^x_bhsLE zc-|)Q>zyfH`@Fb$HakRBA-4by1Q!BO3L-WX@Y9kyX_{NZ|EP69BKeCb;J4V6L(ifv zDEUJ5iy0WpQMDd0T$QR4>+u)b}>p<4f)O#4N1_+2nl4{38hwdJ