From 3cb536919335ef7a50a5fe48b2e423bb05d28db5 Mon Sep 17 00:00:00 2001 From: Laura Gutierrez Funderburk Date: Sun, 27 Oct 2024 13:51:58 -0700 Subject: [PATCH] clean up readme chapter 5 --- ch5/jupyter-notebooks/custom-components.ipynb | 459 ++++++++++++------ ch5/jupyter-notebooks/custom_components.ipynb | 322 ------------ 2 files changed, 317 insertions(+), 464 deletions(-) delete mode 100644 ch5/jupyter-notebooks/custom_components.ipynb diff --git a/ch5/jupyter-notebooks/custom-components.ipynb b/ch5/jupyter-notebooks/custom-components.ipynb index 482bed4..2e7303c 100644 --- a/ch5/jupyter-notebooks/custom-components.ipynb +++ b/ch5/jupyter-notebooks/custom-components.ipynb @@ -1,147 +1,322 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Building custom components with Haystack\n", - "\n", - "Whereas the Haystack library provides a wide range of pre-built components, it is also possible to build custom components. This notebook demonstrates how to build a custom component for Haystack.\n", - "\n", - "The custom component we will build is a simple one: a component that takes a list of strings as input and returns the number of words in each string. This is a simple example, but it demonstrates the basic principles of building a custom component." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from typing import List\n", - "from haystack import component, Pipeline\n", - "\n", - "@component\n", - "class WelcomeTextGenerator:\n", - " \"\"\"\n", - " A component generating personal welcome message and making it upper case\n", - " \"\"\"\n", - " @component.output_types(welcome_text=str, note=str)\n", - " def run(self, name:str):\n", - " return {\"welcome_text\": ('Hello {name}, welcome to Haystack!'.format(name=name)).upper(),\n", - " \"note\": \"welcome message is ready\"}\n", - "\n", - "@component\n", - "class WhitespaceSplitter:\n", - " \"\"\"\n", - " A component for splitting the text by whitespace\n", - " \"\"\"\n", - " @component.output_types(splitted_text=List[str])\n", - " def run(self, text:str):\n", - " return {\"splitted_text\": text.split()}\n", - " \n", - "from haystack import Pipeline\n", - "text_pipeline = Pipeline()\n", - "text_pipeline.add_component(name=\"welcome_text_generator\", instance= WelcomeTextGenerator())\n", - "text_pipeline.add_component(name=\"splitter\", instance= WhitespaceSplitter())\n", - "\n", - "text_pipeline.connect(sender=\"welcome_text_generator.welcome_text\", receiver=\"splitter.text\")\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "text_pipeline.draw(\"./images/text_pipeline.png\", engine='mermaid-image')" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['HELLO', 'JOHN', 'DOE,', 'WELCOME', 'TO', 'HAYSTACK!']\n" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "9CuvGdJNr8iV" + }, + "source": [ + "## Building custom components with Haystack\n", + "\n", + "Whereas the Haystack library provides a wide range of pre-built components, it is also possible to build custom components. This notebook demonstrates how to build a custom component for Haystack.\n", + "\n", + "The custom component we will build is a simple one: a component that takes a list of strings as input and returns the number of words in each string. This is a simple example, but it demonstrates the basic principles of building a custom component." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "onSVL7sIr9Qr", + "outputId": "eb8a631b-7774-4e9a-b783-47570707d03b" + }, + "outputs": [], + "source": [ + "!pip install haystack-ai" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "i8B0p9Hlr8iZ", + "outputId": "ce1a1e18-8610-4221-867f-e2e444981a7a" + }, + "outputs": [], + "source": [ + "from typing import List\n", + "from haystack import component, Pipeline\n", + "\n", + "@component\n", + "class WelcomeTextGenerator:\n", + " \"\"\"\n", + " A component generating personal welcome message and making it upper case\n", + " \"\"\"\n", + " @component.output_types(welcome_text=str, note=str)\n", + " def run(self, name:str):\n", + " return {\"welcome_text\": ('Hello {name}, welcome to Haystack!'.format(name=name)).upper(),\n", + " \"note\": \"welcome message is ready\"}\n", + "\n", + "@component\n", + "class WhitespaceSplitter:\n", + " \"\"\"\n", + " A component for splitting the text by whitespace\n", + " \"\"\"\n", + " @component.output_types(splitted_text=List[str])\n", + " def run(self, text:str):\n", + " return {\"splitted_text\": text.split()}\n", + "\n", + "from haystack import Pipeline\n", + "text_pipeline = Pipeline()\n", + "text_pipeline.add_component(name=\"welcome_text_generator\", instance= WelcomeTextGenerator())\n", + "text_pipeline.add_component(name=\"splitter\", instance= WhitespaceSplitter())\n", + "\n", + "text_pipeline.connect(sender=\"welcome_text_generator.welcome_text\", receiver=\"splitter.text\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "n2jcD_bZr8ib" + }, + "outputs": [], + "source": [ + "text_pipeline.draw(\"./text_pipeline.png\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JCpDPg3Br8ic", + "outputId": "21ad4098-e23b-42c3-8819-839d72457ea8" + }, + "outputs": [], + "source": [ + "result = text_pipeline.run({\"welcome_text_generator\":{\"name\": \"John Doe\"}})\n", + "\n", + "print(result[\"splitter\"][\"splitted_text\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_KeNCCL4r8ic", + "outputId": "beebbc9d-e8b8-4a17-932b-dd5060268b48" + }, + "outputs": [], + "source": [ + "result['welcome_text_generator']" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DGeMki1Qr8id" + }, + "source": [ + "## Incorporating custom components with existing components in a pipeline\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uPLoKITd3Bue" + }, + "source": [ + "### 1 Define custom component" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "id": "nEy8aKIJr8id" + }, + "outputs": [], + "source": [ + "from haystack import component, Document\n", + "from typing import Any, Dict, List, Optional, Union\n", + "from haystack.dataclasses import ByteStream\n", + "from haystack.document_stores.in_memory import InMemoryDocumentStore\n", + "from haystack.document_stores.types import DuplicatePolicy\n", + "\n", + "from haystack.components.preprocessors import DocumentCleaner\n", + "from haystack.components.preprocessors import DocumentSplitter\n", + "from haystack.components.writers import DocumentWriter\n", + "\n", + "@component\n", + "class ParseHTML:\n", + "\n", + " @component.output_types(documents=List[Document])\n", + " def run(self, sources: Dict[str, Any]) -> None:\n", + "\n", + " documents = []\n", + " for source in sources:\n", + "\n", + " for key in source:\n", + " if type(source[key]) == str:\n", + " source[key] = self.clean_text(source[key])\n", + "\n", + " if source['content'] == \"\":\n", + " continue\n", + "\n", + " #drop content from source dictionary\n", + " content = source['content']\n", + " document = Document(content=content, meta=source)\n", + "\n", + " documents.append(document)\n", + "\n", + " return {\"documents\": documents}\n", + "\n", + " def clean_text(self, text):\n", + " # Remove HTML tags using BeautifulSoup\n", + " soup = BeautifulSoup(text, \"html.parser\")\n", + " text = soup.get_text()\n", + " # Remove extra whitespace\n", + " text = re.sub(r'\\s+', ' ', text).strip()\n", + " return text" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sbrVpTyj2-1a" + }, + "source": [ + "### 2 Initialize components" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "id": "D3x1jmba2Xuc" + }, + "outputs": [], + "source": [ + "parse_html = ParseHTML()\n", + "\n", + "document_store = InMemoryDocumentStore()\n", + "\n", + "document_cleaner = DocumentCleaner(\n", + "\n", + " remove_empty_lines=True,\n", + "\n", + " remove_extra_whitespaces=True,\n", + "\n", + " remove_repeated_substrings=False)\n", + "\n", + "document_splitter = DocumentSplitter(split_by=\"passage\", split_length=5)\n", + "\n", + "document_writer = DocumentWriter(\n", + "\n", + " document_store=document_store,\n", + "\n", + " policy = DuplicatePolicy.OVERWRITE)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "OQ4JUPAi39Sf" + }, + "source": [ + "### 3 Add components to the pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "id": "FABhL1tx3_m2" + }, + "outputs": [], + "source": [ + "pipeline = Pipeline()\n", + "\n", + "pipeline.add_component( \"parse_html\", parse_html)\n", + "\n", + "pipeline.add_component( \"document_cleaner\", document_cleaner)\n", + "\n", + "pipeline.add_component( \"document_splitter\", document_splitter)\n", + "\n", + "pipeline.add_component( \"document_writer\", document_writer)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ws_ABuya26Uh" + }, + "source": [ + "### 4 Connect components to one another" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "17wGlzKI2ncq", + "outputId": "4550a470-d37e-48f7-881a-7209d8915965" + }, + "outputs": [], + "source": [ + "# Connect components to one another\n", + "\n", + "pipeline.connect(\"parse_html\", \"document_cleaner\")\n", + "\n", + "pipeline.connect(\"document_cleaner\", \"document_splitter\")\n", + "\n", + "pipeline.connect(\"document_splitter\", \"document_writer\")" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "id": "vRVyG2Ae3D8r" + }, + "outputs": [], + "source": [ + "pipeline.draw(\"./custom_component_pipeline.png\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6eKZuVC13Rf5" + }, + "outputs": [], + "source": [] } - ], - "source": [ - "result = text_pipeline.run({\"welcome_text_generator\":{\"name\": \"John Doe\"}})\n", - "\n", - "print(result[\"splitter\"][\"splitted_text\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'note': 'welcome message is ready'}" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "llm-pipelines", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" } - ], - "source": [ - "result['welcome_text_generator']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Incorporating custom components into an indexing pipeline\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "llm-pipelines", - "language": "python", - "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/ch5/jupyter-notebooks/custom_components.ipynb b/ch5/jupyter-notebooks/custom_components.ipynb deleted file mode 100644 index 2e7303c..0000000 --- a/ch5/jupyter-notebooks/custom_components.ipynb +++ /dev/null @@ -1,322 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "9CuvGdJNr8iV" - }, - "source": [ - "## Building custom components with Haystack\n", - "\n", - "Whereas the Haystack library provides a wide range of pre-built components, it is also possible to build custom components. This notebook demonstrates how to build a custom component for Haystack.\n", - "\n", - "The custom component we will build is a simple one: a component that takes a list of strings as input and returns the number of words in each string. This is a simple example, but it demonstrates the basic principles of building a custom component." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "onSVL7sIr9Qr", - "outputId": "eb8a631b-7774-4e9a-b783-47570707d03b" - }, - "outputs": [], - "source": [ - "!pip install haystack-ai" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "i8B0p9Hlr8iZ", - "outputId": "ce1a1e18-8610-4221-867f-e2e444981a7a" - }, - "outputs": [], - "source": [ - "from typing import List\n", - "from haystack import component, Pipeline\n", - "\n", - "@component\n", - "class WelcomeTextGenerator:\n", - " \"\"\"\n", - " A component generating personal welcome message and making it upper case\n", - " \"\"\"\n", - " @component.output_types(welcome_text=str, note=str)\n", - " def run(self, name:str):\n", - " return {\"welcome_text\": ('Hello {name}, welcome to Haystack!'.format(name=name)).upper(),\n", - " \"note\": \"welcome message is ready\"}\n", - "\n", - "@component\n", - "class WhitespaceSplitter:\n", - " \"\"\"\n", - " A component for splitting the text by whitespace\n", - " \"\"\"\n", - " @component.output_types(splitted_text=List[str])\n", - " def run(self, text:str):\n", - " return {\"splitted_text\": text.split()}\n", - "\n", - "from haystack import Pipeline\n", - "text_pipeline = Pipeline()\n", - "text_pipeline.add_component(name=\"welcome_text_generator\", instance= WelcomeTextGenerator())\n", - "text_pipeline.add_component(name=\"splitter\", instance= WhitespaceSplitter())\n", - "\n", - "text_pipeline.connect(sender=\"welcome_text_generator.welcome_text\", receiver=\"splitter.text\")\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "n2jcD_bZr8ib" - }, - "outputs": [], - "source": [ - "text_pipeline.draw(\"./text_pipeline.png\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JCpDPg3Br8ic", - "outputId": "21ad4098-e23b-42c3-8819-839d72457ea8" - }, - "outputs": [], - "source": [ - "result = text_pipeline.run({\"welcome_text_generator\":{\"name\": \"John Doe\"}})\n", - "\n", - "print(result[\"splitter\"][\"splitted_text\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_KeNCCL4r8ic", - "outputId": "beebbc9d-e8b8-4a17-932b-dd5060268b48" - }, - "outputs": [], - "source": [ - "result['welcome_text_generator']" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DGeMki1Qr8id" - }, - "source": [ - "## Incorporating custom components with existing components in a pipeline\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "uPLoKITd3Bue" - }, - "source": [ - "### 1 Define custom component" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "nEy8aKIJr8id" - }, - "outputs": [], - "source": [ - "from haystack import component, Document\n", - "from typing import Any, Dict, List, Optional, Union\n", - "from haystack.dataclasses import ByteStream\n", - "from haystack.document_stores.in_memory import InMemoryDocumentStore\n", - "from haystack.document_stores.types import DuplicatePolicy\n", - "\n", - "from haystack.components.preprocessors import DocumentCleaner\n", - "from haystack.components.preprocessors import DocumentSplitter\n", - "from haystack.components.writers import DocumentWriter\n", - "\n", - "@component\n", - "class ParseHTML:\n", - "\n", - " @component.output_types(documents=List[Document])\n", - " def run(self, sources: Dict[str, Any]) -> None:\n", - "\n", - " documents = []\n", - " for source in sources:\n", - "\n", - " for key in source:\n", - " if type(source[key]) == str:\n", - " source[key] = self.clean_text(source[key])\n", - "\n", - " if source['content'] == \"\":\n", - " continue\n", - "\n", - " #drop content from source dictionary\n", - " content = source['content']\n", - " document = Document(content=content, meta=source)\n", - "\n", - " documents.append(document)\n", - "\n", - " return {\"documents\": documents}\n", - "\n", - " def clean_text(self, text):\n", - " # Remove HTML tags using BeautifulSoup\n", - " soup = BeautifulSoup(text, \"html.parser\")\n", - " text = soup.get_text()\n", - " # Remove extra whitespace\n", - " text = re.sub(r'\\s+', ' ', text).strip()\n", - " return text" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "sbrVpTyj2-1a" - }, - "source": [ - "### 2 Initialize components" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "id": "D3x1jmba2Xuc" - }, - "outputs": [], - "source": [ - "parse_html = ParseHTML()\n", - "\n", - "document_store = InMemoryDocumentStore()\n", - "\n", - "document_cleaner = DocumentCleaner(\n", - "\n", - " remove_empty_lines=True,\n", - "\n", - " remove_extra_whitespaces=True,\n", - "\n", - " remove_repeated_substrings=False)\n", - "\n", - "document_splitter = DocumentSplitter(split_by=\"passage\", split_length=5)\n", - "\n", - "document_writer = DocumentWriter(\n", - "\n", - " document_store=document_store,\n", - "\n", - " policy = DuplicatePolicy.OVERWRITE)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "OQ4JUPAi39Sf" - }, - "source": [ - "### 3 Add components to the pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": { - "id": "FABhL1tx3_m2" - }, - "outputs": [], - "source": [ - "pipeline = Pipeline()\n", - "\n", - "pipeline.add_component( \"parse_html\", parse_html)\n", - "\n", - "pipeline.add_component( \"document_cleaner\", document_cleaner)\n", - "\n", - "pipeline.add_component( \"document_splitter\", document_splitter)\n", - "\n", - "pipeline.add_component( \"document_writer\", document_writer)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ws_ABuya26Uh" - }, - "source": [ - "### 4 Connect components to one another" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "17wGlzKI2ncq", - "outputId": "4550a470-d37e-48f7-881a-7209d8915965" - }, - "outputs": [], - "source": [ - "# Connect components to one another\n", - "\n", - "pipeline.connect(\"parse_html\", \"document_cleaner\")\n", - "\n", - "pipeline.connect(\"document_cleaner\", \"document_splitter\")\n", - "\n", - "pipeline.connect(\"document_splitter\", \"document_writer\")" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "id": "vRVyG2Ae3D8r" - }, - "outputs": [], - "source": [ - "pipeline.draw(\"./custom_component_pipeline.png\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6eKZuVC13Rf5" - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "llm-pipelines", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.0" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -}