{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "05c0410e-ca39-4f0f-ba37-80e8033bfbb5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from dotenv import load_dotenv, find_dotenv\n",
    "_ = load_dotenv(find_dotenv())\n",
    "openai_api_key = os.environ[\"OPENAI_API_KEY\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "87cd4b20-ad2d-42d4-9c97-ef472838c010",
   "metadata": {},
   "source": [
    "## Basic RAG app with the vector database DeepLake"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a1e4e83e-44e6-4524-a9a9-f68666497f7b",
   "metadata": {},
   "source": [
    "**Load de DeepLake credentials**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "c4509169-f508-44d6-9265-524b71c68387",
   "metadata": {},
   "outputs": [],
   "source": [
    "os.environ[\"ACTIVELOOP_TOKEN\"] = os.environ[\"DEEPLAKE_API_KEY\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "ac79b622-8135-4710-8c9b-60c2f07e63e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "my_activeloop_org_id = os.environ[\"ACTIVELOOP_ORG_ID\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ceb10059-dc8f-42ee-b91c-1674c05459b6",
   "metadata": {},
   "source": [
    "**Name the new database you will create**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "bb7a9ef8-7d25-4c91-855d-3b23f4c5ca92",
   "metadata": {},
   "outputs": [],
   "source": [
    "my_activeloop_dataset_name = \"basic-rag-with-deeplake\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b42b12ce-0e99-4817-85b3-a1c2b092ad91",
   "metadata": {},
   "source": [
    "**Load dependencies**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "fe4b2edd-2f0a-46cf-ae41-3cf0d97795bc",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.embeddings import OpenAIEmbeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "ea36f679-dbeb-4041-8239-53f65a26de83",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.vectorstores import DeepLake"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "499516cf-c75b-4d45-ae0d-fd7a4689f40d",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.text_splitter import RecursiveCharacterTextSplitter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "6cc0d0f5-5719-46ab-865f-cd783bcf2131",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.chains import RetrievalQA"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "af50d1dd-de83-4c58-a484-85c8982595f8",
   "metadata": {},
   "source": [
    "**Create the external knowledge document**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "1d36ca48-8516-4036-8042-9fc8ead22f22",
   "metadata": {},
   "outputs": [],
   "source": [
    "usa_curious_facts = [\n",
    "    \"\"\"\n",
    "    The US celebrates Independence Day from the British Empire \n",
    "    on July 4. However, the country’s Declaration of Independence \n",
    "    was passed on July 2. It was only officially ratified on July 4.\n",
    "    \"\"\",\n",
    "    \"\"\"\n",
    "    The very first documented European to arrive in North America was \n",
    "    the Spaniard Juan Ponce de León, who landed in Florida in 1513.\n",
    "    \"\"\"\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "958f9cc6-45b0-4e4e-8e6d-ce7ecae9f4aa",
   "metadata": {},
   "source": [
    "**Divide the document in smaller chunks of text**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "ca514159-94e8-4ff0-ab02-594000a33dc5",
   "metadata": {},
   "outputs": [],
   "source": [
    "text_splitter = RecursiveCharacterTextSplitter(\n",
    "    chunk_size = 1000,\n",
    "    chunk_overlap = 0\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "d32a76da-9bb7-4b74-94d8-238e3dc16a12",
   "metadata": {},
   "outputs": [],
   "source": [
    "doc_chunks = text_splitter.create_documents(usa_curious_facts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "4d25ad15-beff-4fa7-a067-a86bb5ea8504",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Now you have 2 chunks.\n"
     ]
    }
   ],
   "source": [
    "print(f\"Now you have {len(doc_chunks)} chunks.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6d32ca2f-d440-469b-88ef-c61e2d4e1947",
   "metadata": {},
   "source": [
    "**Create the DeepLake vector database**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "25fd277b-77d6-454e-be75-b89991ac7e46",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/juliocolomer/.pyenv/versions/3.11.4/envs/venv020124/lib/python3.11/site-packages/langchain_core/_api/deprecation.py:117: LangChainDeprecationWarning: The class `langchain_community.embeddings.openai.OpenAIEmbeddings` was deprecated in langchain-community 0.1.0 and will be removed in 0.2.0. An updated version of the class exists in the langchain-openai package and should be used instead. To use it run `pip install -U langchain-openai` and import as `from langchain_openai import OpenAIEmbeddings`.\n",
      "  warn_deprecated(\n"
     ]
    }
   ],
   "source": [
    "embeddings = OpenAIEmbeddings()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "6b11ed2c-cce9-4026-a547-6ded85d76514",
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_path = f\"hub://{my_activeloop_org_id}/{my_activeloop_dataset_name}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "eae8f4e5-35c6-4637-80df-7af705e3bb79",
   "metadata": {},
   "outputs": [],
   "source": [
    "#!pip install \"deeplake[enterprise]\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "9c999347-1e69-4bc1-a6d8-0adaa8ae0a8d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Deep Lake Dataset in hub://julio4ai/basic-rag-with-deeplake already exists, loading from the storage\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/"
     ]
    }
   ],
   "source": [
    "db = DeepLake(\n",
    "    dataset_path=dataset_path,\n",
    "    embedding=embeddings\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a5eecd57-e3bc-4927-bf46-713fe4b75194",
   "metadata": {},
   "source": [
    "**Load the chunks, will transformed into embeddings**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "fbbe4c81-cdb5-452b-8bf1-bb92d564fa32",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Creating 2 embeddings in 1 batches of size 2:: 100%|█| 1/1"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset(path='hub://julio4ai/basic-rag-with-deeplake', tensors=['embedding', 'id', 'metadata', 'text'])\n",
      "\n",
      "  tensor      htype      shape      dtype  compression\n",
      "  -------    -------    -------    -------  ------- \n",
      " embedding  embedding  (10, 1536)  float32   None   \n",
      "    id        text      (10, 1)      str     None   \n",
      " metadata     json      (10, 1)      str     None   \n",
      "   text       text      (10, 1)      str     None   \n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['c58da3a8-c122-11ee-8703-1e00d92e2031',\n",
       " 'c58da4b6-c122-11ee-8703-1e00d92e2031']"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "db.add_documents(doc_chunks)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "47b6286e-58ce-4188-9ccb-a1b6e5445b92",
   "metadata": {},
   "source": [
    "**Create the QA Chain**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "c948087d-f9dd-4933-b5da-70700961d0c3",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_openai import OpenAI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "33324744-cfe2-4949-84fb-d4fc2d7ac3a7",
   "metadata": {},
   "outputs": [],
   "source": [
    "llm = OpenAI()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "dc2913de-99c1-46e8-966d-f87c8a668c44",
   "metadata": {},
   "outputs": [],
   "source": [
    "qa_chain = RetrievalQA.from_chain_type(\n",
    "    llm=llm,\n",
    "    chain_type=\"stuff\",\n",
    "    retriever=db.as_retriever()\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "224637a8-ccec-4e55-b982-a931068527f7",
   "metadata": {},
   "source": [
    "**Ask the App about the document**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "64093d02-8d47-488d-9403-5fcb5d5984e6",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/juliocolomer/.pyenv/versions/3.11.4/envs/venv020124/lib/python3.11/site-packages/langchain_core/_api/deprecation.py:117: LangChainDeprecationWarning: The function `run` was deprecated in LangChain 0.1.0 and will be removed in 0.2.0. Use invoke instead.\n",
      "  warn_deprecated(\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "' The Declaration of Independence was passed on July 2, but officially ratified on July 4.'"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "qa_chain.run(\"When was actually passed the U.S. Declaration of Independence?\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "68fda902-f60a-4a8c-8116-09d98e38a969",
   "metadata": {},
   "source": [
    "**Add new data to the vector database**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "83a63276-d1cb-4a5f-85b8-d80a361e7374",
   "metadata": {},
   "outputs": [],
   "source": [
    "additional_usa_curious_facts = [\n",
    "    \"\"\"\n",
    "    Alaska is the largest state in the US, and used to belong \n",
    "    to the Russian Empire before the US purchased it.\n",
    "    \"\"\",\n",
    "    \"\"\"\n",
    "    Big cities and regions have their own style of pizza: Chicago \n",
    "    Deep-Dish, New York Style, Detroit Pizza, St Louis-Style, and \n",
    "    New England Beach Pizza are just a few different varieties.\n",
    "    \"\"\"\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "c25b3f6e-8c41-4815-9240-d74ae7f025c6",
   "metadata": {},
   "outputs": [],
   "source": [
    "additional_doc_chunks = text_splitter.create_documents(additional_usa_curious_facts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "62551482-a191-4c51-9166-656a0996e5fd",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Creating 2 embeddings in 1 batches of size 2:: 100%|█| 1/1"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset(path='hub://julio4ai/basic-rag-with-deeplake', tensors=['embedding', 'id', 'metadata', 'text'])\n",
      "\n",
      "  tensor      htype      shape      dtype  compression\n",
      "  -------    -------    -------    -------  ------- \n",
      " embedding  embedding  (12, 1536)  float32   None   \n",
      "    id        text      (12, 1)      str     None   \n",
      " metadata     json      (12, 1)      str     None   \n",
      "   text       text      (12, 1)      str     None   \n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['0a1db684-c123-11ee-8703-1e00d92e2031',\n",
       " '0a1db76a-c123-11ee-8703-1e00d92e2031']"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "db.add_documents(additional_doc_chunks)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c53d1d60-0860-4293-ae7d-4b9ca01a9d45",
   "metadata": {},
   "source": [
    "**Ask the app about the new data**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "dd40247f-74c2-4505-a57d-5dd93b79635b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "' Alaska'"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "qa_chain.run(\"What is the largest state in the US?\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "458b29fc-1467-4155-addb-b8549205435e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\nChicago, New York, and Detroit'"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "qa_chain.run(\"Tell me 3 states with their own style of pizza\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "16c54126-c090-48f9-8fbf-96329748039e",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
