diff --git a/docs/docs/integrations/document_loaders/firecrawl.ipynb b/docs/docs/integrations/document_loaders/firecrawl.ipynb index f369b7c95c0..5a1218cb7a4 100644 --- a/docs/docs/integrations/document_loaders/firecrawl.ipynb +++ b/docs/docs/integrations/document_loaders/firecrawl.ipynb @@ -35,7 +35,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -78,7 +78,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -102,7 +102,7 @@ { "data": { "text/plain": [ - "Document(metadata={'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-02T12:09:44.527Z', 'priority': 0.5, 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 500, 'ogLocaleAlternate': []}, page_content='Introducing [Smart Crawl!](https://www.firecrawl.dev/smart-crawl)\\n Join the waitlist to turn any website into an API with AI\\n\\n\\n\\n[🔥 Firecrawl](/)\\n\\n* [Playground](/playground)\\n \\n* [Docs](https://docs.firecrawl.dev)\\n \\n* [Pricing](/pricing)\\n \\n* [Blog](/blog)\\n \\n* Beta Features\\n\\n[Log In](/signin)\\n[Log In](/signin)\\n[Sign Up](/signin/signup)\\n 8.6k\\n\\n[💥 Get 2 months free with yearly plan](/pricing)\\n\\nTurn websites into \\n_LLM-ready_ data\\n=====================================\\n\\nPower your AI apps with clean data crawled from any website. It\\'s also open-source.\\n\\nStart for free (500 credits)Start for free[Talk to us](https://calendly.com/d/cj83-ngq-knk/meet-firecrawl)\\n\\nA product by\\n\\n[![Mendable Logo](https://www.firecrawl.dev/images/mendable_logo_transparent.png)Mendable](https://mendable.ai)\\n\\n![Example Webpage](https://www.firecrawl.dev/multiple-websites.png)\\n\\nCrawl, Scrape, Clean\\n--------------------\\n\\nWe crawl all accessible subpages and give you clean markdown for each. 
No sitemap required.\\n\\n \\n [\\\\\\n {\\\\\\n \"url\": \"https://www.firecrawl.dev/\",\\\\\\n \"markdown\": \"## Welcome to Firecrawl\\\\\\n Firecrawl is a web scraper that allows you to extract the content of a webpage.\"\\\\\\n },\\\\\\n {\\\\\\n \"url\": \"https://www.firecrawl.dev/features\",\\\\\\n \"markdown\": \"## Features\\\\\\n Discover how Firecrawl\\'s cutting-edge features can \\\\\\n transform your data operations.\"\\\\\\n },\\\\\\n {\\\\\\n \"url\": \"https://www.firecrawl.dev/pricing\",\\\\\\n \"markdown\": \"## Pricing Plans\\\\\\n Choose the perfect plan that fits your needs.\"\\\\\\n },\\\\\\n {\\\\\\n \"url\": \"https://www.firecrawl.dev/about\",\\\\\\n \"markdown\": \"## About Us\\\\\\n Learn more about Firecrawl\\'s mission and the \\\\\\n team behind our innovative platform.\"\\\\\\n }\\\\\\n ]\\n \\n\\nNote: The markdown has been edited for display purposes.\\n\\nTrusted by Top Companies\\n------------------------\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/zapier.png)](https://www.zapier.com)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/gamma.svg)](https://gamma.app)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/nvidia-com.png)](https://www.nvidia.com)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/teller-io.svg)](https://www.teller.io)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/stackai.svg)](https://www.stack-ai.com)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/palladiumdigital-co-uk.svg)](https://www.palladiumdigital.co.uk)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/worldwide-casting-com.svg)](https://www.worldwide-casting.com)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/open-gov-sg.png)](https://www.open.gov.sg)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/bain-com.svg)](https://www.bain.com)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/demand-io.svg)](https://www.demand.io)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/nocodegarden-io.png)](https://www.nocodegarden.io)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/cyberagent-co-jp.svg)](https://www.cyberagent.co.jp)\\n\\nIntegrate today\\n---------------\\n\\nEnhance your applications with top-tier web scraping and crawling capabilities.\\n\\n#### Scrape\\n\\nExtract markdown or structured data from websites quickly and efficiently.\\n\\n#### Crawling\\n\\nNavigate and retrieve data from all accessible subpages, even without a sitemap.\\n\\nNode.js\\n\\nPython\\n\\ncURL\\n\\n1\\n\\n2\\n\\n3\\n\\n4\\n\\n5\\n\\n6\\n\\n7\\n\\n8\\n\\n9\\n\\n10\\n\\n// npm install @mendable/firecrawl-js \\n \\nimport FirecrawlApp from \\'@mendable/firecrawl-js\\'; \\n \\nconst app \\\\= new FirecrawlApp({ apiKey: \"fc-YOUR\\\\_API\\\\_KEY\" }); \\n \\n// Scrape a website: \\nconst scrapeResult \\\\= await app.scrapeUrl(\\'firecrawl.dev\\'); \\n \\nconsole.log(scrapeResult.data.markdown)\\n\\n#### Use well-known tools\\n\\nAlready fully integrated with the greatest existing tools and 
workflows.\\n\\n[![LlamaIndex](https://www.firecrawl.dev/logos/llamaindex.svg)](https://docs.llamaindex.ai/en/stable/examples/data_connectors/WebPageDemo/#using-firecrawl-reader/)\\n[![Langchain](https://www.firecrawl.dev/integrations/langchain.png)](https://python.langchain.com/v0.2/docs/integrations/document_loaders/firecrawl/)\\n[![Dify](https://www.firecrawl.dev/logos/dify.png)](https://dify.ai/blog/dify-ai-blog-integrated-with-firecrawl/)\\n[![Dify](https://www.firecrawl.dev/integrations/langflow_2.png)](https://www.langflow.org/)\\n[![Flowise](https://www.firecrawl.dev/integrations/flowise.png)](https://flowiseai.com/)\\n[![CrewAI](https://www.firecrawl.dev/integrations/crewai.png)](https://crewai.com/)\\n\\n#### Start for free, scale easily\\n\\nKick off your journey for free and scale seamlessly as your project expands.\\n\\n[Try it out](/signin/signup)\\n\\n#### Open-source\\n\\nDeveloped transparently and collaboratively. Join our community of contributors.\\n\\n[Check out our repo](https://github.com/mendableai/firecrawl)\\n\\nWe handle the hard stuff\\n------------------------\\n\\nRotating proxies, caching, rate limits, js-blocked content and more\\n\\n#### Crawling\\n\\nFirecrawl crawls all accessible subpages, even without a sitemap.\\n\\n#### Dynamic content\\n\\nFirecrawl gathers data even if a website uses javascript to render content.\\n\\n#### To Markdown\\n\\nFirecrawl returns clean, well formatted markdown - ready for use in LLM applications\\n\\n#### Crawling Orchestration\\n\\nFirecrawl orchestrates the crawling process in parallel for the fastest results.\\n\\n#### Caching\\n\\nFirecrawl caches content, so you don\\'t have to wait for a full scrape unless new content exists.\\n\\n#### Built for AI\\n\\nBuilt by LLM engineers, for LLM engineers. Giving you clean data the way you want it.\\n\\nOur wall of love\\n\\nDon\\'t take our word for it\\n--------------------------\\n\\n![Greg Kamradt](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-02.0afeb750.jpg&w=96&q=75)\\n\\nGreg Kamradt\\n\\n[@GregKamradt](https://twitter.com/GregKamradt/status/1780300642197840307)\\n\\nLLM structured data via API, handling requests, cleaning, and crawling. Enjoyed the early preview.\\n\\n![Amit Naik](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-03.ff5dbe11.jpg&w=96&q=75)\\n\\nAmit Naik\\n\\n[@suprgeek](https://twitter.com/suprgeek/status/1780338213351035254)\\n\\n#llm success with RAG relies on Retrieval. Firecrawl by @mendableai structures web content for processing. 👏\\n\\n![Jerry Liu](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-04.76bef0df.jpg&w=96&q=75)\\n\\nJerry Liu\\n\\n[@jerryjliu0](https://twitter.com/jerryjliu0/status/1781122933349572772)\\n\\nFirecrawl is awesome 🔥 Turns web pages into structured markdown for LLM apps, thanks to @mendableai.\\n\\n![Bardia Pourvakil](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-01.025350bc.jpeg&w=96&q=75)\\n\\nBardia Pourvakil\\n\\n[@thepericulum](https://twitter.com/thepericulum/status/1781397799487078874)\\n\\nThese guys ship. I wanted types for their node SDK, and less than an hour later, I got them. 
Can\\'t recommend them enough.\\n\\n![latentsauce 🧘🏽](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-07.c2285d35.jpeg&w=96&q=75)\\n\\nlatentsauce 🧘🏽\\n\\n[@latentsauce](https://twitter.com/latentsauce/status/1781738253927735331)\\n\\nFirecrawl simplifies data preparation significantly, exactly what I was hoping for. Thank you for creating Firecrawl ❤️❤️❤️\\n\\n![Greg Kamradt](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-02.0afeb750.jpg&w=96&q=75)\\n\\nGreg Kamradt\\n\\n[@GregKamradt](https://twitter.com/GregKamradt/status/1780300642197840307)\\n\\nLLM structured data via API, handling requests, cleaning, and crawling. Enjoyed the early preview.\\n\\n![Amit Naik](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-03.ff5dbe11.jpg&w=96&q=75)\\n\\nAmit Naik\\n\\n[@suprgeek](https://twitter.com/suprgeek/status/1780338213351035254)\\n\\n#llm success with RAG relies on Retrieval. Firecrawl by @mendableai structures web content for processing. 👏\\n\\n![Jerry Liu](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-04.76bef0df.jpg&w=96&q=75)\\n\\nJerry Liu\\n\\n[@jerryjliu0](https://twitter.com/jerryjliu0/status/1781122933349572772)\\n\\nFirecrawl is awesome 🔥 Turns web pages into structured markdown for LLM apps, thanks to @mendableai.\\n\\n![Bardia Pourvakil](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-01.025350bc.jpeg&w=96&q=75)\\n\\nBardia Pourvakil\\n\\n[@thepericulum](https://twitter.com/thepericulum/status/1781397799487078874)\\n\\nThese guys ship. I wanted types for their node SDK, and less than an hour later, I got them. Can\\'t recommend them enough.\\n\\n![latentsauce 🧘🏽](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-07.c2285d35.jpeg&w=96&q=75)\\n\\nlatentsauce 🧘🏽\\n\\n[@latentsauce](https://twitter.com/latentsauce/status/1781738253927735331)\\n\\nFirecrawl simplifies data preparation significantly, exactly what I was hoping for. Thank you for creating Firecrawl ❤️❤️❤️\\n\\n![Michael Ning](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-05.76d7cd3e.png&w=96&q=75)\\n\\nMichael Ning\\n\\n[](#)\\n\\nFirecrawl is impressive, saving us 2/3 the tokens and allowing gpt3.5turbo use over gpt4. 
Major savings in time and money.\\n\\n![Alex Reibman 🖇️](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-06.4ee7cf5a.jpeg&w=96&q=75)\\n\\nAlex Reibman 🖇️\\n\\n[@AlexReibman](https://twitter.com/AlexReibman/status/1780299595484131836)\\n\\nMoved our internal agent\\'s web scraping tool from Apify to Firecrawl because it benchmarked 50x faster with AgentOps.\\n\\n![Michael](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-08.0bed40be.jpeg&w=96&q=75)\\n\\nMichael\\n\\n[@michael\\\\_chomsky](#)\\n\\nI really like some of the design decisions Firecrawl made, so I really want to share with others.\\n\\n![Paul Scott](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-09.d303b5b4.png&w=96&q=75)\\n\\nPaul Scott\\n\\n[@palebluepaul](https://twitter.com/palebluepaul)\\n\\nAppreciating your lean approach, Firecrawl ticks off everything on our list without the cost prohibitive overkill.\\n\\n![Michael Ning](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-05.76d7cd3e.png&w=96&q=75)\\n\\nMichael Ning\\n\\n[](#)\\n\\nFirecrawl is impressive, saving us 2/3 the tokens and allowing gpt3.5turbo use over gpt4. Major savings in time and money.\\n\\n![Alex Reibman 🖇️](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-06.4ee7cf5a.jpeg&w=96&q=75)\\n\\nAlex Reibman 🖇️\\n\\n[@AlexReibman](https://twitter.com/AlexReibman/status/1780299595484131836)\\n\\nMoved our internal agent\\'s web scraping tool from Apify to Firecrawl because it benchmarked 50x faster with AgentOps.\\n\\n![Michael](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-08.0bed40be.jpeg&w=96&q=75)\\n\\nMichael\\n\\n[@michael\\\\_chomsky](#)\\n\\nI really like some of the design decisions Firecrawl made, so I really want to share with others.\\n\\n![Paul Scott](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-09.d303b5b4.png&w=96&q=75)\\n\\nPaul Scott\\n\\n[@palebluepaul](https://twitter.com/palebluepaul)\\n\\nAppreciating your lean approach, Firecrawl ticks off everything on our list without the cost prohibitive overkill.\\n\\nFlexible Pricing\\n----------------\\n\\nStart for free, then scale as you grow\\n\\nYearly (17% off)Yearly (2 months free)Monthly\\n\\nFree Plan\\n---------\\n\\n500 credits\\n\\n$0/month\\n\\n* Scrape 500 pages\\n* 5 /scrape per min\\n* 1 /crawl per min\\n\\nGet Started\\n\\nHobby\\n-----\\n\\n3,000 credits\\n\\n$16/month\\n\\n* Scrape 3,000 pages\\n* 10 /scrape per min\\n* 3 /crawl per min\\n\\nSubscribe\\n\\nStandardMost Popular\\n--------------------\\n\\n100,000 credits\\n\\n$83/month\\n\\n* Scrape 100,000 pages\\n* 50 /scrape per min\\n* 10 /crawl per min\\n\\nSubscribe\\n\\nGrowth\\n------\\n\\n500,000 credits\\n\\n$333/month\\n\\n* Scrape 500,000 pages\\n* 500 /scrape per min\\n* 50 /crawl per min\\n* Priority Support\\n\\nSubscribe\\n\\nEnterprise Plan\\n---------------\\n\\nUnlimited credits. 
Custom RPMs.\\n\\nTalk to us\\n\\n* Top priority support\\n* Feature Acceleration\\n* SLAs\\n* Account Manager\\n* Custom rate limits volume\\n* Custom concurrency limits\\n* Beta features access\\n* CEO\\'s number\\n\\n\\\\* a /scrape refers to the [scrape](https://docs.firecrawl.dev/api-reference/endpoint/scrape)\\n API endpoint.\\n\\n\\\\* a /crawl refers to the [crawl](https://docs.firecrawl.dev/api-reference/endpoint/crawl)\\n API endpoint.\\n\\nScrape Credits\\n--------------\\n\\nScrape credits are consumed for each API request, varying by endpoint and feature.\\n\\n| Features | Credits per page |\\n| --- | --- |\\n| Scrape(/scrape) | 1 |\\n| Crawl(/crawl) | 1 |\\n| Search(/search) | 1 |\\n| Scrape + LLM extraction (/scrape) | 50 |\\n\\n[](/)\\n\\nReady to _Build?_\\n-----------------\\n\\nStart scraping web data for your AI apps today. \\nNo credit card needed.\\n\\n[Get Started](/signin)\\n\\n[Talk to us](https://calendly.com/d/cj83-ngq-knk/meet-firecrawl)\\n\\nFAQ\\n---\\n\\nFrequently asked questions about Firecrawl\\n\\n#### General\\n\\nWhat is Firecrawl?\\n\\nFirecrawl turns entire websites into clean, LLM-ready markdown or structured data. Scrape, crawl and extract the web with a single API. Ideal for AI companies looking to empower their LLM applications with web data.\\n\\nWhat sites work?\\n\\nFirecrawl is best suited for business websites, docs and help centers. We currently don\\'t support social media platforms.\\n\\nWho can benefit from using Firecrawl?\\n\\nFirecrawl is tailored for LLM engineers, data scientists, AI researchers, and developers looking to harness web data for training machine learning models, market research, content aggregation, and more. It simplifies the data preparation process, allowing professionals to focus on insights and model development.\\n\\nIs Firecrawl open-source?\\n\\nYes, it is. You can check out the repository on GitHub. Keep in mind that this repository is currently in its early stages of development. We are in the process of merging custom modules into this mono repository.\\n\\n#### Scraping & Crawling\\n\\nHow does Firecrawl handle dynamic content on websites?\\n\\nUnlike traditional web scrapers, Firecrawl is equipped to handle dynamic content rendered with JavaScript. It ensures comprehensive data collection from all accessible subpages, making it a reliable tool for scraping websites that rely heavily on JS for content delivery.\\n\\nWhy is it not crawling all the pages?\\n\\nThere are a few reasons why Firecrawl may not be able to crawl all the pages of a website. Some common reasons include rate limiting, and anti-scraping mechanisms, disallowing the crawler from accessing certain pages. If you\\'re experiencing issues with the crawler, please reach out to our support team at hello@firecrawl.com.\\n\\nCan Firecrawl crawl websites without a sitemap?\\n\\nYes, Firecrawl can access and crawl all accessible subpages of a website, even in the absence of a sitemap. This feature enables users to gather data from a wide array of web sources with minimal setup.\\n\\nWhat formats can Firecrawl convert web data into?\\n\\nFirecrawl specializes in converting web data into clean, well-formatted markdown. 
This format is particularly suited for LLM applications, offering a structured yet flexible way to represent web content.\\n\\nHow does Firecrawl ensure the cleanliness of the data?\\n\\nFirecrawl employs advanced algorithms to clean and structure the scraped data, removing unnecessary elements and formatting the content into readable markdown. This process ensures that the data is ready for use in LLM applications without further preprocessing.\\n\\nIs Firecrawl suitable for large-scale data scraping projects?\\n\\nAbsolutely. Firecrawl offers various pricing plans, including a Scale plan that supports scraping of millions of pages. With features like caching and scheduled syncs, it\\'s designed to efficiently handle large-scale data scraping and continuous updates, making it ideal for enterprises and large projects.\\n\\nDoes it respect robots.txt?\\n\\nYes, Firecrawl crawler respects the rules set in a website\\'s robots.txt file. If you notice any issues with the way Firecrawl interacts with your website, you can adjust the robots.txt file to control the crawler\\'s behavior. Firecrawl user agent name is \\'FirecrawlAgent\\'. If you notice any behavior that is not expected, please let us know at hello@firecrawl.com.\\n\\nWhat measures does Firecrawl take to handle web scraping challenges like rate limits and caching?\\n\\nFirecrawl is built to navigate common web scraping challenges, including reverse proxies, rate limits, and caching. It smartly manages requests and employs caching techniques to minimize bandwidth usage and avoid triggering anti-scraping mechanisms, ensuring reliable data collection.\\n\\nDoes Firecrawl handle captcha or authentication?\\n\\nFirecrawl avoids captcha by using stealth proxyies. When it encounters captcha, it attempts to solve it automatically, but this is not always possible. We are working to add support for more captcha solving methods. Firecrawl can handle authentication by providing auth headers to the API.\\n\\n#### API Related\\n\\nWhere can I find my API key?\\n\\nClick on the dashboard button on the top navigation menu when logged in and you will find your API key in the main screen and under API Keys.\\n\\n#### Billing\\n\\nIs Firecrawl free?\\n\\nFirecrawl is free for the first 500 scraped pages (500 free credits). After that, you can upgrade to our Standard or Scale plans for more credits.\\n\\nIs there a pay per use plan instead of monthly?\\n\\nNo we do not currently offer a pay per use plan, instead you can upgrade to our Standard or Growth plans for more credits and higher rate limits.\\n\\nHow many credit does scraping, crawling, and extraction cost?\\n\\nScraping costs 1 credit per page. Crawling costs 1 credit per page.\\n\\nDo you charge for failed requests (scrape, crawl, extract)?\\n\\nWe do not charge for any failed requests (scrape, crawl, extract). 
Please contact support at help@firecrawl.dev if you have any questions.\\n\\nWhat payment methods do you accept?\\n\\nWe accept payments through Stripe which accepts most major credit cards, debit cards, and PayPal.\\n\\n[🔥](/)\\n\\n© A product by Mendable.ai - All rights reserved.\\n\\n[StatusStatus](https://firecrawl.betteruptime.com)\\n[Terms of ServiceTerms of Service](/terms-of-service)\\n[Privacy PolicyPrivacy Policy](/privacy-policy)\\n\\n[Twitter](https://twitter.com/mendableai)\\n[GitHub](https://github.com/mendableai)\\n[Discord](https://discord.gg/gSmWdAkdwd)\\n\\n###### Helpful Links\\n\\n* [Status](https://firecrawl.betteruptime.com/)\\n \\n* [Pricing](/pricing)\\n \\n* [Blog](https://www.firecrawl.dev/blog)\\n \\n* [Docs](https://docs.firecrawl.dev)\\n \\n\\nBacked by![Y Combinator Logo](https://www.firecrawl.dev/images/yc.svg)\\n\\n![SOC 2 Type II](https://www.firecrawl.dev/soc2type2badge.png)\\n\\n###### Resources\\n\\n* [Community](#0)\\n \\n* [Terms of service](#0)\\n \\n* [Collaboration features](#0)\\n \\n\\n###### Legals\\n\\n* [Refund policy](#0)\\n \\n* [Terms & Conditions](#0)\\n \\n* [Privacy policy](#0)\\n \\n* [Brand Kit](#0)')" + "Document(metadata={'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}, page_content='Introducing [Smart Crawl!](https://www.firecrawl.dev/smart-crawl)\\n Join the waitlist to turn any website into an API with AI\\n\\n\\n\\n[🔥 Firecrawl](/)\\n\\n* [Playground](/playground)\\n \\n* [Docs](https://docs.firecrawl.dev)\\n \\n* [Pricing](/pricing)\\n \\n* [Blog](/blog)\\n \\n* Beta Features\\n\\n[Log In](/signin)\\n[Log In](/signin)\\n[Sign Up](/signin/signup)\\n 8.9k\\n\\n[💥 Get 2 months free with yearly plan](/pricing)\\n\\nTurn websites into \\n_LLM-ready_ data\\n=====================================\\n\\nPower your AI apps with clean data crawled from any website. It\\'s also open-source.\\n\\nStart for free (500 credits)Start for free[Talk to us](https://calendly.com/d/cj83-ngq-knk/meet-firecrawl)\\n\\nA product by\\n\\n[![Mendable Logo](https://www.firecrawl.dev/images/mendable_logo_transparent.png)Mendable](https://mendable.ai)\\n\\n![Example Webpage](https://www.firecrawl.dev/multiple-websites.png)\\n\\nCrawl, Scrape, Clean\\n--------------------\\n\\nWe crawl all accessible subpages and give you clean markdown for each. 
No sitemap required.\\n\\n \\n [\\\\\\n {\\\\\\n \"url\": \"https://www.firecrawl.dev/\",\\\\\\n \"markdown\": \"## Welcome to Firecrawl\\\\\\n Firecrawl is a web scraper that allows you to extract the content of a webpage.\"\\\\\\n },\\\\\\n {\\\\\\n \"url\": \"https://www.firecrawl.dev/features\",\\\\\\n \"markdown\": \"## Features\\\\\\n Discover how Firecrawl\\'s cutting-edge features can \\\\\\n transform your data operations.\"\\\\\\n },\\\\\\n {\\\\\\n \"url\": \"https://www.firecrawl.dev/pricing\",\\\\\\n \"markdown\": \"## Pricing Plans\\\\\\n Choose the perfect plan that fits your needs.\"\\\\\\n },\\\\\\n {\\\\\\n \"url\": \"https://www.firecrawl.dev/about\",\\\\\\n \"markdown\": \"## About Us\\\\\\n Learn more about Firecrawl\\'s mission and the \\\\\\n team behind our innovative platform.\"\\\\\\n }\\\\\\n ]\\n \\n\\nNote: The markdown has been edited for display purposes.\\n\\nTrusted by Top Companies\\n------------------------\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/zapier.png)](https://www.zapier.com)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/gamma.svg)](https://gamma.app)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/nvidia-com.png)](https://www.nvidia.com)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/teller-io.svg)](https://www.teller.io)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/stackai.svg)](https://www.stack-ai.com)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/palladiumdigital-co-uk.svg)](https://www.palladiumdigital.co.uk)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/worldwide-casting-com.svg)](https://www.worldwide-casting.com)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/open-gov-sg.png)](https://www.open.gov.sg)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/bain-com.svg)](https://www.bain.com)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/demand-io.svg)](https://www.demand.io)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/nocodegarden-io.png)](https://www.nocodegarden.io)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/cyberagent-co-jp.svg)](https://www.cyberagent.co.jp)\\n\\nIntegrate today\\n---------------\\n\\nEnhance your applications with top-tier web scraping and crawling capabilities.\\n\\n#### Scrape\\n\\nExtract markdown or structured data from websites quickly and efficiently.\\n\\n#### Crawling\\n\\nNavigate and retrieve data from all accessible subpages, even without a sitemap.\\n\\nNode.js\\n\\nPython\\n\\ncURL\\n\\n1\\n\\n2\\n\\n3\\n\\n4\\n\\n5\\n\\n6\\n\\n7\\n\\n8\\n\\n9\\n\\n10\\n\\n// npm install @mendable/firecrawl-js \\n \\nimport FirecrawlApp from \\'@mendable/firecrawl-js\\'; \\n \\nconst app \\\\= new FirecrawlApp({ apiKey: \"fc-YOUR\\\\_API\\\\_KEY\" }); \\n \\n// Scrape a website: \\nconst scrapeResult \\\\= await app.scrapeUrl(\\'firecrawl.dev\\'); \\n \\nconsole.log(scrapeResult.data.markdown)\\n\\n#### Use well-known tools\\n\\nAlready fully integrated with the greatest existing tools and 
workflows.\\n\\n[![LlamaIndex](https://www.firecrawl.dev/logos/llamaindex.svg)](https://docs.llamaindex.ai/en/stable/examples/data_connectors/WebPageDemo/#using-firecrawl-reader/)\\n[![Langchain](https://www.firecrawl.dev/integrations/langchain.png)](https://python.langchain.com/v0.2/docs/integrations/document_loaders/firecrawl/)\\n[![Dify](https://www.firecrawl.dev/logos/dify.png)](https://dify.ai/blog/dify-ai-blog-integrated-with-firecrawl/)\\n[![Dify](https://www.firecrawl.dev/integrations/langflow_2.png)](https://www.langflow.org/)\\n[![Flowise](https://www.firecrawl.dev/integrations/flowise.png)](https://flowiseai.com/)\\n[![CrewAI](https://www.firecrawl.dev/integrations/crewai.png)](https://crewai.com/)\\n\\n#### Start for free, scale easily\\n\\nKick off your journey for free and scale seamlessly as your project expands.\\n\\n[Try it out](/signin/signup)\\n\\n#### Open-source\\n\\nDeveloped transparently and collaboratively. Join our community of contributors.\\n\\n[Check out our repo](https://github.com/mendableai/firecrawl)\\n\\nWe handle the hard stuff\\n------------------------\\n\\nRotating proxies, caching, rate limits, js-blocked content and more\\n\\n#### Crawling\\n\\nFirecrawl crawls all accessible subpages, even without a sitemap.\\n\\n#### Dynamic content\\n\\nFirecrawl gathers data even if a website uses javascript to render content.\\n\\n#### To Markdown\\n\\nFirecrawl returns clean, well formatted markdown - ready for use in LLM applications\\n\\n#### Crawling Orchestration\\n\\nFirecrawl orchestrates the crawling process in parallel for the fastest results.\\n\\n#### Caching\\n\\nFirecrawl caches content, so you don\\'t have to wait for a full scrape unless new content exists.\\n\\n#### Built for AI\\n\\nBuilt by LLM engineers, for LLM engineers. Giving you clean data the way you want it.\\n\\nOur wall of love\\n\\nDon\\'t take our word for it\\n--------------------------\\n\\n![Greg Kamradt](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-02.0afeb750.jpg&w=96&q=75)\\n\\nGreg Kamradt\\n\\n[@GregKamradt](https://twitter.com/GregKamradt/status/1780300642197840307)\\n\\nLLM structured data via API, handling requests, cleaning, and crawling. Enjoyed the early preview.\\n\\n![Amit Naik](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-03.ff5dbe11.jpg&w=96&q=75)\\n\\nAmit Naik\\n\\n[@suprgeek](https://twitter.com/suprgeek/status/1780338213351035254)\\n\\n#llm success with RAG relies on Retrieval. Firecrawl by @mendableai structures web content for processing. 👏\\n\\n![Jerry Liu](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-04.76bef0df.jpg&w=96&q=75)\\n\\nJerry Liu\\n\\n[@jerryjliu0](https://twitter.com/jerryjliu0/status/1781122933349572772)\\n\\nFirecrawl is awesome 🔥 Turns web pages into structured markdown for LLM apps, thanks to @mendableai.\\n\\n![Bardia Pourvakil](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-01.025350bc.jpeg&w=96&q=75)\\n\\nBardia Pourvakil\\n\\n[@thepericulum](https://twitter.com/thepericulum/status/1781397799487078874)\\n\\nThese guys ship. I wanted types for their node SDK, and less than an hour later, I got them. 
Can\\'t recommend them enough.\\n\\n![latentsauce 🧘🏽](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-07.c2285d35.jpeg&w=96&q=75)\\n\\nlatentsauce 🧘🏽\\n\\n[@latentsauce](https://twitter.com/latentsauce/status/1781738253927735331)\\n\\nFirecrawl simplifies data preparation significantly, exactly what I was hoping for. Thank you for creating Firecrawl ❤️❤️❤️\\n\\n![Greg Kamradt](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-02.0afeb750.jpg&w=96&q=75)\\n\\nGreg Kamradt\\n\\n[@GregKamradt](https://twitter.com/GregKamradt/status/1780300642197840307)\\n\\nLLM structured data via API, handling requests, cleaning, and crawling. Enjoyed the early preview.\\n\\n![Amit Naik](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-03.ff5dbe11.jpg&w=96&q=75)\\n\\nAmit Naik\\n\\n[@suprgeek](https://twitter.com/suprgeek/status/1780338213351035254)\\n\\n#llm success with RAG relies on Retrieval. Firecrawl by @mendableai structures web content for processing. 👏\\n\\n![Jerry Liu](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-04.76bef0df.jpg&w=96&q=75)\\n\\nJerry Liu\\n\\n[@jerryjliu0](https://twitter.com/jerryjliu0/status/1781122933349572772)\\n\\nFirecrawl is awesome 🔥 Turns web pages into structured markdown for LLM apps, thanks to @mendableai.\\n\\n![Bardia Pourvakil](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-01.025350bc.jpeg&w=96&q=75)\\n\\nBardia Pourvakil\\n\\n[@thepericulum](https://twitter.com/thepericulum/status/1781397799487078874)\\n\\nThese guys ship. I wanted types for their node SDK, and less than an hour later, I got them. Can\\'t recommend them enough.\\n\\n![latentsauce 🧘🏽](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-07.c2285d35.jpeg&w=96&q=75)\\n\\nlatentsauce 🧘🏽\\n\\n[@latentsauce](https://twitter.com/latentsauce/status/1781738253927735331)\\n\\nFirecrawl simplifies data preparation significantly, exactly what I was hoping for. Thank you for creating Firecrawl ❤️❤️❤️\\n\\n![Michael Ning](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-05.76d7cd3e.png&w=96&q=75)\\n\\nMichael Ning\\n\\n[](#)\\n\\nFirecrawl is impressive, saving us 2/3 the tokens and allowing gpt3.5turbo use over gpt4. 
Major savings in time and money.\\n\\n![Alex Reibman 🖇️](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-06.4ee7cf5a.jpeg&w=96&q=75)\\n\\nAlex Reibman 🖇️\\n\\n[@AlexReibman](https://twitter.com/AlexReibman/status/1780299595484131836)\\n\\nMoved our internal agent\\'s web scraping tool from Apify to Firecrawl because it benchmarked 50x faster with AgentOps.\\n\\n![Michael](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-08.0bed40be.jpeg&w=96&q=75)\\n\\nMichael\\n\\n[@michael\\\\_chomsky](#)\\n\\nI really like some of the design decisions Firecrawl made, so I really want to share with others.\\n\\n![Paul Scott](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-09.d303b5b4.png&w=96&q=75)\\n\\nPaul Scott\\n\\n[@palebluepaul](https://twitter.com/palebluepaul)\\n\\nAppreciating your lean approach, Firecrawl ticks off everything on our list without the cost prohibitive overkill.\\n\\n![Michael Ning](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-05.76d7cd3e.png&w=96&q=75)\\n\\nMichael Ning\\n\\n[](#)\\n\\nFirecrawl is impressive, saving us 2/3 the tokens and allowing gpt3.5turbo use over gpt4. Major savings in time and money.\\n\\n![Alex Reibman 🖇️](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-06.4ee7cf5a.jpeg&w=96&q=75)\\n\\nAlex Reibman 🖇️\\n\\n[@AlexReibman](https://twitter.com/AlexReibman/status/1780299595484131836)\\n\\nMoved our internal agent\\'s web scraping tool from Apify to Firecrawl because it benchmarked 50x faster with AgentOps.\\n\\n![Michael](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-08.0bed40be.jpeg&w=96&q=75)\\n\\nMichael\\n\\n[@michael\\\\_chomsky](#)\\n\\nI really like some of the design decisions Firecrawl made, so I really want to share with others.\\n\\n![Paul Scott](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-09.d303b5b4.png&w=96&q=75)\\n\\nPaul Scott\\n\\n[@palebluepaul](https://twitter.com/palebluepaul)\\n\\nAppreciating your lean approach, Firecrawl ticks off everything on our list without the cost prohibitive overkill.\\n\\nFlexible Pricing\\n----------------\\n\\nStart for free, then scale as you grow\\n\\nYearly (17% off)Yearly (2 months free)Monthly\\n\\nFree Plan\\n---------\\n\\n500 credits\\n\\n$0/month\\n\\n* Scrape 500 pages\\n* 5 /scrape per min\\n* 1 /crawl per min\\n\\nGet Started\\n\\nHobby\\n-----\\n\\n3,000 credits\\n\\n$16/month\\n\\n* Scrape 3,000 pages\\n* 10 /scrape per min\\n* 3 /crawl per min\\n\\nSubscribe\\n\\nStandardMost Popular\\n--------------------\\n\\n100,000 credits\\n\\n$83/month\\n\\n* Scrape 100,000 pages\\n* 50 /scrape per min\\n* 10 /crawl per min\\n\\nSubscribe\\n\\nGrowth\\n------\\n\\n500,000 credits\\n\\n$333/month\\n\\n* Scrape 500,000 pages\\n* 500 /scrape per min\\n* 50 /crawl per min\\n* Priority Support\\n\\nSubscribe\\n\\nEnterprise Plan\\n---------------\\n\\nUnlimited credits. 
Custom RPMs.\\n\\nTalk to us\\n\\n* Top priority support\\n* Feature Acceleration\\n* SLAs\\n* Account Manager\\n* Custom rate limits volume\\n* Custom concurrency limits\\n* Beta features access\\n* CEO\\'s number\\n\\n\\\\* a /scrape refers to the [scrape](https://docs.firecrawl.dev/api-reference/endpoint/scrape)\\n API endpoint.\\n\\n\\\\* a /crawl refers to the [crawl](https://docs.firecrawl.dev/api-reference/endpoint/crawl)\\n API endpoint.\\n\\nScrape Credits\\n--------------\\n\\nScrape credits are consumed for each API request, varying by endpoint and feature.\\n\\n| Features | Credits per page |\\n| --- | --- |\\n| Scrape(/scrape) | 1 |\\n| Crawl(/crawl) | 1 |\\n| Search(/search) | 1 |\\n| Scrape + LLM extraction (/scrape) | 50 |\\n\\n[🔥](/)\\n\\nReady to _Build?_\\n-----------------\\n\\nStart scraping web data for your AI apps today. \\nNo credit card needed.\\n\\n[Get Started](/signin)\\n\\n[Talk to us](https://calendly.com/d/cj83-ngq-knk/meet-firecrawl)\\n\\nFAQ\\n---\\n\\nFrequently asked questions about Firecrawl\\n\\n#### General\\n\\nWhat is Firecrawl?\\n\\nFirecrawl turns entire websites into clean, LLM-ready markdown or structured data. Scrape, crawl and extract the web with a single API. Ideal for AI companies looking to empower their LLM applications with web data.\\n\\nWhat sites work?\\n\\nFirecrawl is best suited for business websites, docs and help centers. We currently don\\'t support social media platforms.\\n\\nWho can benefit from using Firecrawl?\\n\\nFirecrawl is tailored for LLM engineers, data scientists, AI researchers, and developers looking to harness web data for training machine learning models, market research, content aggregation, and more. It simplifies the data preparation process, allowing professionals to focus on insights and model development.\\n\\nIs Firecrawl open-source?\\n\\nYes, it is. You can check out the repository on GitHub. Keep in mind that this repository is currently in its early stages of development. We are in the process of merging custom modules into this mono repository.\\n\\n#### Scraping & Crawling\\n\\nHow does Firecrawl handle dynamic content on websites?\\n\\nUnlike traditional web scrapers, Firecrawl is equipped to handle dynamic content rendered with JavaScript. It ensures comprehensive data collection from all accessible subpages, making it a reliable tool for scraping websites that rely heavily on JS for content delivery.\\n\\nWhy is it not crawling all the pages?\\n\\nThere are a few reasons why Firecrawl may not be able to crawl all the pages of a website. Some common reasons include rate limiting, and anti-scraping mechanisms, disallowing the crawler from accessing certain pages. If you\\'re experiencing issues with the crawler, please reach out to our support team at hello@firecrawl.com.\\n\\nCan Firecrawl crawl websites without a sitemap?\\n\\nYes, Firecrawl can access and crawl all accessible subpages of a website, even in the absence of a sitemap. This feature enables users to gather data from a wide array of web sources with minimal setup.\\n\\nWhat formats can Firecrawl convert web data into?\\n\\nFirecrawl specializes in converting web data into clean, well-formatted markdown. 
This format is particularly suited for LLM applications, offering a structured yet flexible way to represent web content.\\n\\nHow does Firecrawl ensure the cleanliness of the data?\\n\\nFirecrawl employs advanced algorithms to clean and structure the scraped data, removing unnecessary elements and formatting the content into readable markdown. This process ensures that the data is ready for use in LLM applications without further preprocessing.\\n\\nIs Firecrawl suitable for large-scale data scraping projects?\\n\\nAbsolutely. Firecrawl offers various pricing plans, including a Scale plan that supports scraping of millions of pages. With features like caching and scheduled syncs, it\\'s designed to efficiently handle large-scale data scraping and continuous updates, making it ideal for enterprises and large projects.\\n\\nDoes it respect robots.txt?\\n\\nYes, Firecrawl crawler respects the rules set in a website\\'s robots.txt file. If you notice any issues with the way Firecrawl interacts with your website, you can adjust the robots.txt file to control the crawler\\'s behavior. Firecrawl user agent name is \\'FirecrawlAgent\\'. If you notice any behavior that is not expected, please let us know at hello@firecrawl.com.\\n\\nWhat measures does Firecrawl take to handle web scraping challenges like rate limits and caching?\\n\\nFirecrawl is built to navigate common web scraping challenges, including reverse proxies, rate limits, and caching. It smartly manages requests and employs caching techniques to minimize bandwidth usage and avoid triggering anti-scraping mechanisms, ensuring reliable data collection.\\n\\nDoes Firecrawl handle captcha or authentication?\\n\\nFirecrawl avoids captcha by using stealth proxyies. When it encounters captcha, it attempts to solve it automatically, but this is not always possible. We are working to add support for more captcha solving methods. Firecrawl can handle authentication by providing auth headers to the API.\\n\\n#### API Related\\n\\nWhere can I find my API key?\\n\\nClick on the dashboard button on the top navigation menu when logged in and you will find your API key in the main screen and under API Keys.\\n\\n#### Billing\\n\\nIs Firecrawl free?\\n\\nFirecrawl is free for the first 500 scraped pages (500 free credits). After that, you can upgrade to our Standard or Scale plans for more credits.\\n\\nIs there a pay per use plan instead of monthly?\\n\\nNo we do not currently offer a pay per use plan, instead you can upgrade to our Standard or Growth plans for more credits and higher rate limits.\\n\\nHow many credit does scraping, crawling, and extraction cost?\\n\\nScraping costs 1 credit per page. Crawling costs 1 credit per page.\\n\\nDo you charge for failed requests (scrape, crawl, extract)?\\n\\nWe do not charge for any failed requests (scrape, crawl, extract). 
Please contact support at help@firecrawl.dev if you have any questions.\\n\\nWhat payment methods do you accept?\\n\\nWe accept payments through Stripe which accepts most major credit cards, debit cards, and PayPal.\\n\\n[🔥](/)\\n\\n© A product by Mendable.ai - All rights reserved.\\n\\n[StatusStatus](https://firecrawl.betteruptime.com)\\n[Terms of ServiceTerms of Service](/terms-of-service)\\n[Privacy PolicyPrivacy Policy](/privacy-policy)\\n\\n[Twitter](https://twitter.com/mendableai)\\n[GitHub](https://github.com/mendableai)\\n[Discord](https://discord.gg/gSmWdAkdwd)\\n\\n###### Helpful Links\\n\\n* [Status](https://firecrawl.betteruptime.com/)\\n \\n* [Pricing](/pricing)\\n \\n* [Blog](https://www.firecrawl.dev/blog)\\n \\n* [Docs](https://docs.firecrawl.dev)\\n \\n\\nBacked by![Y Combinator Logo](https://www.firecrawl.dev/images/yc.svg)\\n\\n![SOC 2 Type II](https://www.firecrawl.dev/soc2type2badge.png)\\n\\n###### Resources\\n\\n* [Community](#0)\\n \\n* [Terms of service](#0)\\n \\n* [Collaboration features](#0)\\n \\n\\n###### Legals\\n\\n* [Refund policy](#0)\\n \\n* [Terms & Conditions](#0)\\n \\n* [Privacy policy](#0)\\n \\n* [Brand Kit](#0)')" ] }, "execution_count": 4, @@ -118,14 +118,14 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-02T12:09:44.527Z', 'priority': 0.5, 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 500, 'ogLocaleAlternate': []}\n" + "{'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}\n" ] } ], @@ -144,38 +144,58 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ - "page = []\n", + "pages = []\n", "for doc in loader.lazy_load():\n", - " page.append(doc)\n", - " if len(page) >= 10:\n", + " pages.append(doc)\n", + " if len(pages) >= 10:\n", " # do some paged operation, e.g.\n", " # index.upsert(page)\n", "\n", - " page = []" + " pages = []" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "1" + "8" ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "len(page)" + "len(pages)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Introducing [Smart Crawl!](https://www.firecrawl.dev/smart-crawl)\n", + " Join the waitlist to turn any web\n", + "{'ogUrl': 
'https://www.firecrawl.dev/blog/introducing-fire-engine-for-firecrawl', 'title': 'Introducing Fire Engine for Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/images/blog/fire-engine-launch.png', 'ogTitle': 'Introducing Fire Engine for Firecrawl', 'sitemap': {'lastmod': '2024-08-06T00:00:00.000Z', 'changefreq': 'weekly'}, 'keywords': 'firecrawl,fireengine,web crawling,dashboard,web scraping,LLM,data extraction', 'sourceURL': 'https://www.firecrawl.dev/blog/introducing-fire-engine-for-firecrawl', 'ogSiteName': 'Firecrawl', 'description': 'The most scalable, reliable, and fast way to get web data for Firecrawl.', 'ogDescription': 'The most scalable, reliable, and fast way to get web data for Firecrawl.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}\n" + ] + } + ], + "source": [ + "print(pages[0].page_content[:100])\n", + "print(pages[0].metadata)" ] }, { diff --git a/docs/docs/integrations/document_loaders/pypdfloader.ipynb b/docs/docs/integrations/document_loaders/pypdfloader.ipynb index a10d096121b..9debf5cf183 100644 --- a/docs/docs/integrations/document_loaders/pypdfloader.ipynb +++ b/docs/docs/integrations/document_loaders/pypdfloader.ipynb @@ -122,7 +122,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -131,21 +131,41 @@ "6" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "page = []\n", + "pages = []\n", "for doc in loader.lazy_load():\n", - " page.append(doc)\n", - " if len(page) >= 10:\n", + " pages.append(doc)\n", + " if len(pages) >= 10:\n", " # do some paged operation, e.g.\n", " # index.upsert(page)\n", "\n", - " page = []\n", - "len(page)" + " pages = []\n", + "len(pages)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LayoutParser : A Unified Toolkit for DL-Based DIA 11\n", + "focuses on precision, efficiency, and robustness. \n", + "{'source': './example_data/layout-parser-paper.pdf', 'page': 10}\n" + ] + } + ], + "source": [ + "print(pages[0].page_content[:100])\n", + "print(pages[0].metadata)" ] }, { diff --git a/docs/docs/integrations/document_loaders/recursive_url.ipynb b/docs/docs/integrations/document_loaders/recursive_url.ipynb index 310fc33a5e0..4ea5731726c 100644 --- a/docs/docs/integrations/document_loaders/recursive_url.ipynb +++ b/docs/docs/integrations/document_loaders/recursive_url.ipynb @@ -229,14 +229,14 @@ } ], "source": [ - "page = []\n", + "pages = []\n", "for doc in loader.lazy_load():\n", - " page.append(doc)\n", - " if len(page) >= 10:\n", + " pages.append(doc)\n", + " if len(pages) >= 10:\n", " # do some paged operation, e.g.\n", " # index.upsert(page)\n", "\n", - " page = []" + " pages = []" ] }, { diff --git a/docs/docs/integrations/document_loaders/web_base.ipynb b/docs/docs/integrations/document_loaders/web_base.ipynb index 6a233b30e5a..e41c40244e1 100644 --- a/docs/docs/integrations/document_loaders/web_base.ipynb +++ b/docs/docs/integrations/document_loaders/web_base.ipynb @@ -250,27 +250,80 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 2, "id": "303d2f4e", "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "Document(metadata={'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. 
Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}, page_content=\"\\n\\n\\n\\n\\n\\n\\n\\n\\nESPN - Serving Sports Fans. Anytime. Anywhere.\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n Skip to main content\\n \\n\\n Skip to navigation\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n<\\n\\n>\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nMenuESPN\\n\\n\\n\\n\\n\\nscores\\n\\n\\n\\nNFLNBAMLBOlympicsSoccerWNBA…BoxingCFLNCAACricketF1GolfHorseLLWSMMANASCARNBA G LeagueNBA Summer LeagueNCAAFNCAAMNCAAWNHLNWSLPLLProfessional WrestlingRacingRN BBRN FBRugbySports BettingTennisX GamesUFLMore ESPNFantasyWatchESPN BETESPN+\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\nSubscribe Now\\n\\n\\n\\n\\n\\nBoxing: Crawford vs. Madrimov (ESPN+ PPV)\\n\\n\\n\\n\\n\\n\\n\\nPFL Playoffs: Heavyweights & Women's Flyweights\\n\\n\\n\\n\\n\\n\\n\\nMLB\\n\\n\\n\\n\\n\\n\\n\\nLittle League Baseball: Regionals\\n\\n\\n\\n\\n\\n\\n\\nIn The Arena: Serena Williams\\n\\n\\n\\n\\n\\n\\n\\nThe 30 College Football Playoff Contenders\\n\\n\\nQuick Links\\n\\n\\n\\n\\n2024 Paris Olympics\\n\\n\\n\\n\\n\\n\\n\\nOlympics: Everything to Know\\n\\n\\n\\n\\n\\n\\n\\nMLB Standings\\n\\n\\n\\n\\n\\n\\n\\nSign up: Fantasy Football\\n\\n\\n\\n\\n\\n\\n\\nWNBA Rookie Tracker\\n\\n\\n\\n\\n\\n\\n\\nNBA Free Agency Buzz\\n\\n\\n\\n\\n\\n\\n\\nLittle League Baseball, Softball\\n\\n\\n\\n\\n\\n\\n\\nESPN Radio: Listen Live\\n\\n\\n\\n\\n\\n\\n\\nWatch Golf on ESPN\\n\\n\\n\\n\\n\\n\\nFavorites\\n\\n\\n\\n\\n\\n\\n Manage Favorites\\n \\n\\n\\n\\nCustomize ESPNCreate AccountLog InFantasy\\n\\n\\n\\n\\nFootball\\n\\n\\n\\n\\n\\n\\n\\nBaseball\\n\\n\\n\\n\\n\\n\\n\\nBasketball\\n\\n\\n\\n\\n\\n\\n\\nHockey\\n\\n\\nESPN Sites\\n\\n\\n\\n\\nESPN Deportes\\n\\n\\n\\n\\n\\n\\n\\nAndscape\\n\\n\\n\\n\\n\\n\\n\\nespnW\\n\\n\\n\\n\\n\\n\\n\\nESPNFC\\n\\n\\n\\n\\n\\n\\n\\nX Games\\n\\n\\n\\n\\n\\n\\n\\nSEC Network\\n\\n\\nESPN Apps\\n\\n\\n\\n\\nESPN\\n\\n\\n\\n\\n\\n\\n\\nESPN Fantasy\\n\\n\\n\\n\\n\\n\\n\\nTournament Challenge\\n\\n\\nFollow ESPN\\n\\n\\n\\n\\nFacebook\\n\\n\\n\\n\\n\\n\\n\\nX/Twitter\\n\\n\\n\\n\\n\\n\\n\\nInstagram\\n\\n\\n\\n\\n\\n\\n\\nSnapchat\\n\\n\\n\\n\\n\\n\\n\\nTikTok\\n\\n\\n\\n\\n\\n\\n\\nYouTube\\n\\n\\nCollege football's most entertaining conference? Why the 16-team Big 12 is wiiiiiide open this seasonLong known as one of the sport's most unpredictable conferences, the new-look Big 12 promises another dose of chaos.11hBill ConnellyScott Winters/Icon SportswireUSC, Oregon and the quest to bulk up for the Big TenTo improve on D, the Trojans wanted to bulk up for a new league. They're not the only team trying to do that.10hAdam RittenbergThe 30 teams that can reach the CFPConnelly's best games of the 2024 seasonTOP HEADLINESTeam USA sets world record in 4x400 mixed relayGermany beats France, undefeated in men's hoopsU.S. 
men's soccer exits Games after Morocco routHungary to protest Khelif's Olympic participationKobe's Staples Center locker sells for record $2.9MDjokovic, Alcaraz to meet again, this time for goldKerr: Team USA lineups based on players' rolesMarchand wins 200m IM; McEvoy takes 50 freeScouting Shedeur Sanders' NFL futureLATEST FROM PARISBreakout stars, best moments and what comes next: Breaking down the Games halfway throughAt the midpoint of the Olympics, we look back at some of the best moments and forward at what's still to come in the second week.35mESPNMustafa Yalcin/Anadolu via Getty ImagesThe numbers behind USA's world record in 4x400 mixed relay4h0:46Medal trackerFull resultsFull coverage of the OlympicsPRESEASON HAS BEGUN!New kickoff rules on display to start Hall of Fame Game19h0:41McAfee on NFL's new kickoff: It looks like a practice drill6h1:11NFL's new kickoff rules debut to mixed reviewsTexans-Bears attracts more bets than MLBTOP RANK BOXINGSATURDAY ON ESPN+ PPVWhy Terence Crawford is playing the long game for a chance to face CaneloCrawford is approaching Saturday's marquee matchup against Israil Madrimov with his sights set on landing a bigger one next: against Canelo Alvarez.10hMike CoppingerMark Robinson/Matchroom BoxingBradley's take: Crawford's power vs. Israil Madrimov's disciplined styleTimothy Bradley Jr. breaks down the junior middleweight title fight.2dTimothy Bradley Jr.Buy Crawford vs. Madrimov on ESPN+ PPVChance to impress for Madrimov -- and UzbekistanHOW FRIDAY WENTMORE FROM THE PARIS OLYMPICSGrant Fisher makes U.S. track history, Marchand wins 4th gold and more Friday at the Paris GamesSha'Carri Richardson made her long-awaited Olympic debut during the women's 100m preliminary round on Friday. Here's everything else you might have missed from Paris.31mESPNGetty ImagesU.S. men's loss to Morocco is a wake-up call before World CupOutclassed by Morocco, the U.S. men's Olympic team can take plenty of lessons with the World Cup on the horizon.4hSam BordenFull coverage of the OlympicsOLYMPIC MEN'S HOOPS SCOREBOARDFRIDAY'S GAMESOLYMPIC STANDOUTSWhy Simone Biles is now Stephen A.'s No. 1 Olympian7h0:58Alcaraz on the cusp of history after securing spot in gold medal match8h0:59Simone Biles' gymnastics titles: Olympics, Worlds, more statsOLYMPIC MEN'S SOCCER SCOREBOARDFRIDAY'S GAMESTRADE DEADLINE FALLOUTOlney: Eight big questions for traded MLB playersCan Jazz Chisholm Jr. handle New York? Is Jack Flaherty healthy? Will Jorge Soler's defense play? Key questions for players in new places.10hBuster OlneyWinslow Townson/Getty ImagesRanking the top prospects who changed teams at the MLB trade deadlineYou know the major leaguers who moved by now -- but what about the potential stars of tomorrow?1dKiley McDanielMLB Power RankingsSeries odds: Dodgers still on top; Phillies, Yanks behind them Top HeadlinesTeam USA sets world record in 4x400 mixed relayGermany beats France, undefeated in men's hoopsU.S. men's soccer exits Games after Morocco routHungary to protest Khelif's Olympic participationKobe's Staples Center locker sells for record $2.9MDjokovic, Alcaraz to meet again, this time for goldKerr: Team USA lineups based on players' rolesMarchand wins 200m IM; McEvoy takes 50 freeScouting Shedeur Sanders' NFL futureFavorites FantasyManage FavoritesFantasy HomeCustomize ESPNCreate AccountLog InICYMI0:47Nelson Palacio rips an incredible goal from outside the boxNelson Palacio scores an outside-of-the-box goal for Real Salt Lake in the 79th minute. 
\\n\\n\\nMedal Tracker\\n\\n\\n\\nCountries\\nAthletes\\n\\nOverall Medal Leaders43USA36FRA31CHNIndividual Medal LeadersGoldCHN 13FRA 11AUS 11SilverUSA 18FRA 12GBR 10BronzeUSA 16FRA 13CHN 9Overall Medal Leaders4MarchandMarchand3O'CallaghanO'Callaghan3McIntoshMcIntoshIndividual Medal LeadersGoldMarchand 4O'Callaghan 3McIntosh 2SilverSmith 3Huske 2Walsh 2BronzeYufei 3Bhaker 2Haughey 2\\n\\n\\nFull Medal Tracker\\n\\n\\nBest of ESPN+ESPNCollege Football Playoff 2024: 30 teams can reach postseasonHeather Dinich analyzes the teams with at least a 10% chance to make the CFP.AP Photo/Ross D. FranklinNFL Hall of Fame predictions: Who will make the next 10 classes?When will Richard Sherman and Marshawn Lynch make it? Who could join Aaron Donald in 2029? Let's map out each Hall of Fame class until 2034.Thearon W. Henderson/Getty ImagesMLB trade deadline 2024: Ranking prospects who changed teamsYou know the major leaguers who moved by now -- but what about the potential stars of tomorrow who went the other way in those deals? Trending NowIllustration by ESPNRanking the top 100 professional athletes since 2000Who tops our list of the top athletes since 2000? We're unveiling the top 25, including our voters' pick for the No. 1 spot.Photo by Kevin C. Cox/Getty Images2024 NFL offseason recap: Signings, coach moves, new rulesThink you missed something in the NFL offseason? We've got you covered with everything important that has happened since February.Stacy Revere/Getty ImagesTop 25 college football stadiums: Rose Bowl, Michigan and moreFourteen of ESPN's college football writers rank the 25 best stadiums in the sport. Who's No. 1, who missed the cut and what makes these stadiums so special?ESPNInside Nate Robinson's silent battle -- and his fight to liveFor nearly 20 years Nate Robinson has been fighting a silent battle -- one he didn't realize until recently could end his life. Sign up to play the #1 Fantasy game!Create A LeagueJoin Public LeagueReactivate A LeagueMock Draft NowSign up for FREE!Create A LeagueJoin a Public LeagueReactivate a LeaguePractice With a Mock DraftSign up for FREE!Create A LeagueJoin a Public LeagueReactivate a LeaguePractice with a Mock DraftGet a custom ESPN experienceEnjoy the benefits of a personalized accountSelect your favorite leagues, teams and players and get the latest scores, news and updates that matter most to you. \\n\\nESPN+\\n\\n\\n\\n\\nBoxing: Crawford vs. 
Madrimov (ESPN+ PPV)\\n\\n\\n\\n\\n\\n\\n\\nPFL Playoffs: Heavyweights & Women's Flyweights\\n\\n\\n\\n\\n\\n\\n\\nMLB\\n\\n\\n\\n\\n\\n\\n\\nLittle League Baseball: Regionals\\n\\n\\n\\n\\n\\n\\n\\nIn The Arena: Serena Williams\\n\\n\\n\\n\\n\\n\\n\\nThe 30 College Football Playoff Contenders\\n\\n\\nQuick Links\\n\\n\\n\\n\\n2024 Paris Olympics\\n\\n\\n\\n\\n\\n\\n\\nOlympics: Everything to Know\\n\\n\\n\\n\\n\\n\\n\\nMLB Standings\\n\\n\\n\\n\\n\\n\\n\\nSign up: Fantasy Football\\n\\n\\n\\n\\n\\n\\n\\nWNBA Rookie Tracker\\n\\n\\n\\n\\n\\n\\n\\nNBA Free Agency Buzz\\n\\n\\n\\n\\n\\n\\n\\nLittle League Baseball, Softball\\n\\n\\n\\n\\n\\n\\n\\nESPN Radio: Listen Live\\n\\n\\n\\n\\n\\n\\n\\nWatch Golf on ESPN\\n\\n\\nFantasy\\n\\n\\n\\n\\nFootball\\n\\n\\n\\n\\n\\n\\n\\nBaseball\\n\\n\\n\\n\\n\\n\\n\\nBasketball\\n\\n\\n\\n\\n\\n\\n\\nHockey\\n\\n\\nESPN Sites\\n\\n\\n\\n\\nESPN Deportes\\n\\n\\n\\n\\n\\n\\n\\nAndscape\\n\\n\\n\\n\\n\\n\\n\\nespnW\\n\\n\\n\\n\\n\\n\\n\\nESPNFC\\n\\n\\n\\n\\n\\n\\n\\nX Games\\n\\n\\n\\n\\n\\n\\n\\nSEC Network\\n\\n\\nESPN Apps\\n\\n\\n\\n\\nESPN\\n\\n\\n\\n\\n\\n\\n\\nESPN Fantasy\\n\\n\\n\\n\\n\\n\\n\\nTournament Challenge\\n\\n\\nFollow ESPN\\n\\n\\n\\n\\nFacebook\\n\\n\\n\\n\\n\\n\\n\\nX/Twitter\\n\\n\\n\\n\\n\\n\\n\\nInstagram\\n\\n\\n\\n\\n\\n\\n\\nSnapchat\\n\\n\\n\\n\\n\\n\\n\\nTikTok\\n\\n\\n\\n\\n\\n\\n\\nYouTube\\n\\n\\nTerms of UsePrivacy PolicyYour US State Privacy RightsChildren's Online Privacy PolicyInterest-Based AdsAbout Nielsen MeasurementDo Not Sell or Share My Personal InformationContact UsDisney Ad Sales SiteWork for ESPNCorrectionsESPN BET is owned and operated by PENN Entertainment, Inc. and its subsidiaries ('PENN'). ESPN BET is available in states where PENN is licensed to offer sports wagering. Must be 21+ to wager. If you or someone you know has a gambling problem and wants help, call 1-800-GAMBLER.Copyright: © 2024 ESPN Enterprises, Inc. All rights reserved.\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\")" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "ESPN - Serving Sports Fans. Anytime. Anywhere.\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "{'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. 
Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}\n" + ] } ], "source": [ - "page = []\n", + "pages = []\n", "for doc in loader.lazy_load():\n", - " page.append(doc)\n", + " pages.append(doc)\n", "\n", - "page[0]" + "print(pages[0].page_content[:100])\n", + "print(pages[0].metadata)" ] }, { diff --git a/libs/community/langchain_community/document_loaders/firecrawl.py b/libs/community/langchain_community/document_loaders/firecrawl.py index 2577ce6eda4..467813419ae 100644 --- a/libs/community/langchain_community/document_loaders/firecrawl.py +++ b/libs/community/langchain_community/document_loaders/firecrawl.py @@ -6,11 +6,63 @@ from langchain_core.utils import get_from_env class FireCrawlLoader(BaseLoader): - """Load web pages as Documents using FireCrawl. - - Must have Python package `firecrawl` installed and a FireCrawl API key. See - https://www.firecrawl.dev/ for more. """ + FireCrawlLoader document loader integration + + Setup: + Install ``firecrawl-py``, ``langchain_community`` and set environment variable ``FIRECRAWL_API_KEY``. + + .. code-block:: bash + + pip install -U firecrawl-py langchain_community + export FIRECRAWL_API_KEY="your-api-key" + + Instantiate: + .. code-block:: python + + from langchain_community.document_loaders import FireCrawlLoader + + loader = FireCrawlLoader( + url = "https://firecrawl.dev", + mode = "crawl" + # other params = ... + ) + + Lazy load: + .. code-block:: python + + docs = [] + docs_lazy = loader.lazy_load() + + # async variant: + # docs_lazy = await loader.alazy_load() + + for doc in docs_lazy: + docs.append(doc) + print(docs[0].page_content[:100]) + print(docs[0].metadata) + + .. code-block:: python + + Introducing [Smart Crawl!](https://www.firecrawl.dev/smart-crawl) + Join the waitlist to turn any web + {'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []} + + + Async load: + .. code-block:: python + + docs = await loader.aload() + print(docs[0].page_content[:100]) + print(docs[0].metadata) + + ..
code-block:: python + + Introducing [Smart Crawl!](https://www.firecrawl.dev/smart-crawl) + Join the waitlist to turn any web + {'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []} + + """ # noqa: E501 def __init__( self, diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py index ea4485e5891..b7fbf575045 100644 --- a/libs/community/langchain_community/document_loaders/pdf.py +++ b/libs/community/langchain_community/document_loaders/pdf.py @@ -161,10 +161,66 @@ class OnlinePDFLoader(BasePDFLoader): class PyPDFLoader(BasePDFLoader): - """Load PDF using pypdf into list of documents. - - Loader chunks by page and stores page numbers in metadata. """ + PyPDFLoader document loader integration + + Setup: + Install ``langchain-community``. + + .. code-block:: bash + + pip install -U langchain-community + + Instantiate: + .. code-block:: python + + from langchain_community.document_loaders import PyPDFLoader + + loader = PyPDFLoader( + file_path = "./example_data/layout-parser-paper.pdf", + password = "my-password", + extract_images = True, + # headers = None + # extraction_mode = "plain", + # extraction_kwargs = None, + ) + + Lazy load: + .. code-block:: python + + docs = [] + docs_lazy = loader.lazy_load() + + # async variant: + # docs_lazy = await loader.alazy_load() + + for doc in docs_lazy: + docs.append(doc) + print(docs[0].page_content[:100]) + print(docs[0].metadata) + + .. code-block:: python + + LayoutParser : A Unified Toolkit for Deep + Learning Based Document Image Analysis + Zejiang Shen1( ), R + {'source': './example_data/layout-parser-paper.pdf', 'page': 0} + + Async load: + .. code-block:: python + + docs = await loader.aload() + print(docs[0].page_content[:100]) + print(docs[0].metadata) + + .. code-block:: python + + LayoutParser : A Unified Toolkit for Deep + Learning Based Document Image Analysis + Zejiang Shen1( ), R + {'source': './example_data/layout-parser-paper.pdf', 'page': 0} + """ # noqa: E501 def __init__( self, diff --git a/libs/community/langchain_community/document_loaders/web_base.py b/libs/community/langchain_community/document_loaders/web_base.py index 2d77222aec5..3eab7f351a1 100644 --- a/libs/community/langchain_community/document_loaders/web_base.py +++ b/libs/community/langchain_community/document_loaders/web_base.py @@ -39,7 +39,75 @@ def _build_metadata(soup: Any, url: str) -> dict: class WebBaseLoader(BaseLoader): - """Load HTML pages using `urllib` and parse them with `BeautifulSoup'.""" + """ + WebBaseLoader document loader integration + + Setup: + Install ``langchain_community``. + + .. code-block:: bash + + pip install -U langchain_community + + Instantiate: + ..
code-block:: python + + from langchain_community.document_loaders import WebBaseLoader + + loader = WebBaseLoader( + web_path = "https://www.espn.com/" + # header_template = None, + # verify_ssl = True, + # proxies = None, + # continue_on_failure = False, + # autoset_encoding = True, + # encoding = None, + # web_paths = (), + # requests_per_second = 2, + # default_parser = "html.parser", + # requests_kwargs = None, + # raise_for_status = False, + # bs_get_text_kwargs = None, + # bs_kwargs = None, + # session = None, + # show_progress = True, + ) + + Lazy load: + .. code-block:: python + + docs = [] + docs_lazy = loader.lazy_load() + + # async variant: + # docs_lazy = await loader.alazy_load() + + for doc in docs_lazy: + docs.append(doc) + print(docs[0].page_content[:100]) + print(docs[0].metadata) + + .. code-block:: python + + ESPN - Serving Sports Fans. Anytime. Anywhere. + + {'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'} + + + Async load: + .. code-block:: python + + docs = await loader.aload() + print(docs[0].page_content[:100]) + print(docs[0].metadata) + + .. code-block:: python + + ESPN - Serving Sports Fans. Anytime. Anywhere. + + {'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'} + + """ # noqa: E501 def __init__( self, diff --git a/libs/partners/unstructured/langchain_unstructured/document_loaders.py b/libs/partners/unstructured/langchain_unstructured/document_loaders.py index 39452aa2c6b..bd3bcd6dbbe 100644 --- a/libs/partners/unstructured/langchain_unstructured/document_loaders.py +++ b/libs/partners/unstructured/langchain_unstructured/document_loaders.py @@ -24,29 +24,9 @@ _DEFAULT_URL = "https://api.unstructuredapp.io/general/v0/general" class UnstructuredLoader(BaseLoader): """Unstructured document loader interface. - Partition and load files using either the `unstructured-client` sdk and the - Unstructured API or locally using the `unstructured` library. - - API: - This package is configured to work with the Unstructured API by default. - To use the Unstructured API, set - `partition_via_api=True` and define `api_key`. If you are running the unstructured - API locally, you can change the API rule by defining `url` when you initialize the - loader. The hosted Unstructured API requires an API key. See the links below to - learn more about our API offerings and get an API key. - - Local: - To partition files locally, you must have the `unstructured` package installed. - You can install it with `pip install unstructured`. - By default the file loader uses the Unstructured `partition` function and will - automatically detect the file type. - - In addition to document specific partition parameters, Unstructured has a rich set - of "chunking" parameters for post-processing elements into more useful text segments - for uses cases such as Retrieval Augmented Generation (RAG). You can pass additional - Unstructured kwargs to the loader to configure different unstructured settings. - Setup: + Install ``langchain-unstructured`` and set environment variable ``UNSTRUCTURED_API_KEY``. + .. 
code-block:: bash pip install -U langchain-unstructured export UNSTRUCTURED_API_KEY="your-api-key" @@ -63,20 +43,46 @@ class UnstructuredLoader(BaseLoader): strategy="fast", ) - Load: + Lazy load: .. code-block:: python - docs = loader.load() + docs = [] + docs_lazy = loader.lazy_load() + + # async variant: + # docs_lazy = await loader.alazy_load() + + for doc in docs_lazy: + docs.append(doc) print(docs[0].page_content[:100]) print(docs[0].metadata) + .. code-block:: python + + 1 2 0 2 + {'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2024-07-25T21:28:58', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'UncategorizedText', 'element_id': 'd3ce55f220dfb75891b4394a18bcb973'} + + + Async load: + .. code-block:: python + + docs = await loader.aload() + print(docs[0].page_content[:100]) + print(docs[0].metadata) + + .. code-block:: python + + 1 2 0 2 + {'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2024-07-25T21:28:58', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'UncategorizedText', 'element_id': 'd3ce55f220dfb75891b4394a18bcb973'} + + References ---------- https://docs.unstructured.io/api-reference/api-services/sdk https://docs.unstructured.io/api-reference/api-services/overview https://docs.unstructured.io/open-source/core-functionality/partitioning https://docs.unstructured.io/open-source/core-functionality/chunking - """ + """ # noqa: E501 def __init__( self,
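
As context for the UnstructuredLoader changes above: the removed prose described two partitioning modes (locally via the open-source ``unstructured`` package, or through the hosted Unstructured API with ``partition_via_api=True`` and an ``api_key``), while the new Setup section assumes the hosted API key. The sketch below is an illustrative, non-authoritative example of that same choice; it uses only names already referenced in the diff (``file_path``, ``strategy``, ``partition_via_api``, ``api_key``), and the file path is a placeholder taken from the docstring examples.

.. code-block:: python

    import os

    from langchain_unstructured import UnstructuredLoader

    # Local partitioning: relies on the open-source `unstructured` package
    # being installed with support for the file type being loaded.
    local_loader = UnstructuredLoader(
        file_path="./example_data/layout-parser-paper.pdf",  # placeholder path
        strategy="fast",
    )

    # Hosted API partitioning: enable partition_via_api and supply an API key,
    # which the Setup section above assumes is exported as UNSTRUCTURED_API_KEY.
    api_loader = UnstructuredLoader(
        file_path="./example_data/layout-parser-paper.pdf",  # placeholder path
        partition_via_api=True,
        api_key=os.environ["UNSTRUCTURED_API_KEY"],
    )

    # Both loaders expose the lazy_load() iterator shown in the docstring; each
    # element is a Document carrying Unstructured metadata such as 'category'.
    for doc in local_loader.lazy_load():
        print(doc.metadata.get("category"), doc.page_content[:60])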