{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "05c0410e-ca39-4f0f-ba37-80e8033bfbb5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from dotenv import load_dotenv, find_dotenv\n",
    "_ = load_dotenv(find_dotenv())\n",
    "openai_api_key = os.environ[\"OPENAI_API_KEY\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "87cd4b20-ad2d-42d4-9c97-ef472838c010",
   "metadata": {},
   "source": [
    "## Basic RAG app with the vector database DeepLake"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a1e4e83e-44e6-4524-a9a9-f68666497f7b",
   "metadata": {},
   "source": [
    "**Load the DeepLake credentials**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "c4509169-f508-44d6-9265-524b71c68387",
   "metadata": {},
   "outputs": [],
   "source": [
    "os.environ[\"ACTIVELOOP_TOKEN\"] = os.environ[\"DEEPLAKE_API_KEY\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "ac79b622-8135-4710-8c9b-60c2f07e63e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "my_activeloop_org_id = os.environ[\"ACTIVELOOP_ORG_ID\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ceb10059-dc8f-42ee-b91c-1674c05459b6",
   "metadata": {},
   "source": [
    "**Name the new database you will create**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "bb7a9ef8-7d25-4c91-855d-3b23f4c5ca92",
   "metadata": {},
   "outputs": [],
   "source": [
    "my_activeloop_dataset_name = \"basic-rag-with-deeplake\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b42b12ce-0e99-4817-85b3-a1c2b092ad91",
   "metadata": {},
   "source": [
    "**Load dependencies**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "fe4b2edd-2f0a-46cf-ae41-3cf0d97795bc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# OpenAIEmbeddings is maintained in the langchain-openai package; the\n",
    "# langchain.embeddings import path is deprecated.\n",
    "from langchain_openai import OpenAIEmbeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "ea36f679-dbeb-4041-8239-53f65a26de83",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.vectorstores import DeepLake"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "499516cf-c75b-4d45-ae0d-fd7a4689f40d",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.text_splitter import RecursiveCharacterTextSplitter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "6cc0d0f5-5719-46ab-865f-cd783bcf2131",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.chains import RetrievalQA"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "af50d1dd-de83-4c58-a484-85c8982595f8",
   "metadata": {},
   "source": [
    "**Create the external knowledge document**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "1d36ca48-8516-4036-8042-9fc8ead22f22",
   "metadata": {},
   "outputs": [],
   "source": [
    "usa_curious_facts = [\n",
    "    \"\"\"\n",
    "    The US celebrates Independence Day from the British Empire \n",
    "    on July 4. However, the country’s Declaration of Independence \n",
    "    was passed on July 2. It was only officially ratified on July 4.\n",
    "    \"\"\",\n",
    "    \"\"\"\n",
    "    The very first documented European to arrive in North America was \n",
    "    the Spaniard Juan Ponce de León, who landed in Florida in 1513.\n",
    "    \"\"\"\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "958f9cc6-45b0-4e4e-8e6d-ce7ecae9f4aa",
   "metadata": {},
   "source": [
    "**Divide the document into smaller chunks of text**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "ca514159-94e8-4ff0-ab02-594000a33dc5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Splitter configured with chunk_size=1000 and no overlap between chunks.\n",
    "text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "d32a76da-9bb7-4b74-94d8-238e3dc16a12",
   "metadata": {},
   "outputs": [],
   "source": [
    "doc_chunks = text_splitter.create_documents(usa_curious_facts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "4d25ad15-beff-4fa7-a067-a86bb5ea8504",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Now you have 2 chunks.\n"
     ]
    }
   ],
   "source": [
    "print(f\"Now you have {len(doc_chunks)} chunks.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6d32ca2f-d440-469b-88ef-c61e2d4e1947",
   "metadata": {},
   "source": [
    "**Create the DeepLake vector database**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "25fd277b-77d6-454e-be75-b89991ac7e46",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/juliocolomer/.pyenv/versions/3.11.4/envs/venv020124/lib/python3.11/site-packages/langchain_core/_api/deprecation.py:117: LangChainDeprecationWarning: The class `langchain_community.embeddings.openai.OpenAIEmbeddings` was deprecated in langchain-community 0.1.0 and will be removed in 0.2.0. An updated version of the class exists in the langchain-openai package and should be used instead. To use it run `pip install -U langchain-openai` and import as `from langchain_openai import OpenAIEmbeddings`.\n",
      "  warn_deprecated(\n"
     ]
    }
   ],
   "source": [
    "embeddings = OpenAIEmbeddings()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "6b11ed2c-cce9-4026-a547-6ded85d76514",
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_path = f\"hub://{my_activeloop_org_id}/{my_activeloop_dataset_name}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "eae8f4e5-35c6-4637-80df-7af705e3bb79",
   "metadata": {},
   "outputs": [],
   "source": [
    "#!pip install \"deeplake[enterprise]\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "9c999347-1e69-4bc1-a6d8-0adaa8ae0a8d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Deep Lake Dataset in hub://julio4ai/basic-rag-with-deeplake already exists, loading from the storage\n"
     ]
    }
   ],
   "source": [
    "db = DeepLake(\n",
    "    dataset_path=dataset_path,\n",
    "    embedding=embeddings\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a5eecd57-e3bc-4927-bf46-713fe4b75194",
   "metadata": {},
   "source": [
    "**Load the chunks, which will be transformed into embeddings**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "fbbe4c81-cdb5-452b-8bf1-bb92d564fa32",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Creating 2 embeddings in 1 batches of size 2:: 100%|█| 1/1"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset(path='hub://julio4ai/basic-rag-with-deeplake', tensors=['embedding', 'id', 'metadata', 'text'])\n",
      "\n",
      "  tensor      htype      shape      dtype  compression\n",
      "  -------    -------    -------    -------  ------- \n",
      " embedding  embedding  (14, 1536)  float32   None   \n",
      "    id        text      (14, 1)      str     None   \n",
      " metadata     json      (14, 1)      str     None   \n",
      "   text       text      (14, 1)      str     None   \n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['99bda362-c12d-11ee-b5ba-1e00d92e2031',\n",
       " '99bda506-c12d-11ee-b5ba-1e00d92e2031']"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "db.add_documents(doc_chunks)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "47b6286e-58ce-4188-9ccb-a1b6e5445b92",
   "metadata": {},
   "source": [
    "**Create the QA Chain**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "c948087d-f9dd-4933-b5da-70700961d0c3",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_openai import OpenAI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "33324744-cfe2-4949-84fb-d4fc2d7ac3a7",
   "metadata": {},
   "outputs": [],
   "source": [
    "llm = OpenAI()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "dc2913de-99c1-46e8-966d-f87c8a668c44",
   "metadata": {},
   "outputs": [],
   "source": [
    "qa_chain = RetrievalQA.from_chain_type(\n",
    "    llm=llm,\n",
    "    chain_type=\"stuff\",\n",
    "    retriever=db.as_retriever()\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "224637a8-ccec-4e55-b982-a931068527f7",
   "metadata": {},
   "source": [
    "**Ask the app about the document**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "64093d02-8d47-488d-9403-5fcb5d5984e6",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/juliocolomer/.pyenv/versions/3.11.4/envs/venv020124/lib/python3.11/site-packages/langchain_core/_api/deprecation.py:117: LangChainDeprecationWarning: The function `run` was deprecated in LangChain 0.1.0 and will be removed in 0.2.0. Use invoke instead.\n",
      "  warn_deprecated(\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "' The Declaration of Independence was passed on July 2, but it was officially ratified on July 4.'"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "qa_chain.run(\"When was actually passed the U.S. Declaration of Independence?\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "68fda902-f60a-4a8c-8116-09d98e38a969",
   "metadata": {},
   "source": [
    "**Add new data to the vector database**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "83a63276-d1cb-4a5f-85b8-d80a361e7374",
   "metadata": {},
   "outputs": [],
   "source": [
    "additional_usa_curious_facts = [\n",
    "    \"\"\"\n",
    "    Alaska is the largest state in the US, and used to belong \n",
    "    to the Russian Empire before the US purchased it.\n",
    "    \"\"\",\n",
    "    \"\"\"\n",
    "    Big cities and regions have their own style of pizza: Chicago \n",
    "    Deep-Dish, New York Style, Detroit Pizza, St Louis-Style, and \n",
    "    New England Beach Pizza are just a few different varieties.\n",
    "    \"\"\"\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "c25b3f6e-8c41-4815-9240-d74ae7f025c6",
   "metadata": {},
   "outputs": [],
   "source": [
    "additional_doc_chunks = text_splitter.create_documents(additional_usa_curious_facts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "62551482-a191-4c51-9166-656a0996e5fd",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Creating 2 embeddings in 1 batches of size 2:: 100%|█| 1/1"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset(path='hub://julio4ai/basic-rag-with-deeplake', tensors=['embedding', 'id', 'metadata', 'text'])\n",
      "\n",
      "  tensor      htype      shape      dtype  compression\n",
      "  -------    -------    -------    -------  ------- \n",
      " embedding  embedding  (16, 1536)  float32   None   \n",
      "    id        text      (16, 1)      str     None   \n",
      " metadata     json      (16, 1)      str     None   \n",
      "   text       text      (16, 1)      str     None   \n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['aa98554c-c12d-11ee-b5ba-1e00d92e2031',\n",
       " 'aa9855ba-c12d-11ee-b5ba-1e00d92e2031']"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "db.add_documents(additional_doc_chunks)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c53d1d60-0860-4293-ae7d-4b9ca01a9d45",
   "metadata": {},
   "source": [
    "**Ask the app about the new data**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "dd40247f-74c2-4505-a57d-5dd93b79635b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "' Alaska'"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "qa_chain.run(\"What is the largest state in the US?\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "458b29fc-1467-4155-addb-b8549205435e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "' Illinois, New York, and Michigan'"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "qa_chain.run(\"Tell me 3 states with their own style of pizza\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2865a99a-68a1-432f-88ce-da5facaaa526",
   "metadata": {},
   "source": [
    "## Similarity Search"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7ad971c9-6016-46c4-8449-46b620f89a18",
   "metadata": {},
   "source": [
    "The following similarity search call is not working after the last update, so it is commented out for now. Will check and report back ASAP."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "16c54126-c090-48f9-8fbf-96329748039e",
   "metadata": {},
   "outputs": [],
   "source": [
    "#db.similarity_search_with_score(\"What is the largest state in the US?\", k=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d6a69c6a-fcb9-47ec-bcd3-a91dfe2f9a53",
   "metadata": {},
   "source": [
    "## Retriever"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "280ac818-dcd1-46b1-8514-7cee429164e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "retriever = db.as_retriever(search_kwargs={\"k\": 1})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "c47bd208-f4e4-4d09-8ff6-6c9ac51a5a55",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Document(page_content='Alaska is the largest state in the US, and used to belong \\n    to the Russian Empire before the US purchased it.')]"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "retriever.get_relevant_documents(query=\"What is the largest state in the US?\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d4a6b34e-4c35-45d8-987e-50cdd2c3fc05",
   "metadata": {},
   "source": [
    "## Indexing API\n",
    "Use the Indexing API when you need to update the contents of your vector database."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "101edec6-0aff-4e5f-806f-dbff3952d30a",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.indexes import SQLRecordManager, index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "1f3fb9c3-37d7-40cc-837a-ca9deec51600",
   "metadata": {},
   "outputs": [],
   "source": [
    "# record_manager = SQLRecordManager(\n",
    "#     namespace,\n",
    "#     db_url\n",
    "# )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "53f746b3-b914-4a1e-8bd3-0cb4142032d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# record_manager.create_schema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "4444fd06-3f13-4a63-8454-1b39904efb3c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# index(\n",
    "#     docs,\n",
    "#     record_manager,\n",
    "#     vectorstore,\n",
    "#     cleanup=None,\n",
    "#     source_id_key=\"source\",\n",
    "# )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "4dbdecc9-2a06-461c-b53e-4646d46134c5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# from langchain.schema import Document\n",
    "\n",
    "# docs[1].page_content = \"updated\"\n",
    "\n",
    "# del docs[6]\n",
    "\n",
    "# docs.append(Document(\n",
    "#     page_content=\"new content\", \n",
    "#     metadata={\"source\": \"important\"}\n",
    "# ))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "dbb1bfea-6fae-46a1-8995-8d987ef32713",
   "metadata": {},
   "outputs": [],
   "source": [
    "# index(\n",
    "#     docs,\n",
    "#     record_manager,\n",
    "#     vectorstore,\n",
    "#     cleanup=None,\n",
    "#     source_id_key=\"source\",\n",
    "# )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6acfaca0-bc26-4f05-8f32-064fb1bfb233",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
