{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "cb26d669-80c7-4132-9a79-7cf6680538e3",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from dotenv import load_dotenv, find_dotenv\n",
    "_ = load_dotenv(find_dotenv())\n",
    "openai_api_key = os.environ[\"OPENAI_API_KEY\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a1217b85-bda5-48a2-a733-3040a1ee38e0",
   "metadata": {},
   "source": [
    "## Basic app for QA a a code library or a github repository"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "020a520a-08b8-423b-bbf6-6bc8fb316769",
   "metadata": {},
   "source": [
    "**Load the github repo**\n",
    "<br>\n",
    "We will load the code of the github repo The Fuzz, a small python module for string matching."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "d08285d1-544d-4cd0-a8d8-94939364fc62",
   "metadata": {},
   "outputs": [],
   "source": [
    "root_dir = \"data/thefuzz-master\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "5dbf3eca-fba9-402d-af47-d031cb4006d5",
   "metadata": {},
   "outputs": [],
   "source": [
    "document_chunks = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "55d4b606-2f7b-471b-a319-5e66a4898a57",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.document_loaders import TextLoader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "e8843188-e15e-48d0-8fa8-7cb7a347f727",
   "metadata": {},
   "outputs": [],
   "source": [
    "for dirpath, dirnames, filenames in os.walk(root_dir):\n",
    "    for file in filenames:\n",
    "        try:\n",
    "            loader = TextLoader(\n",
    "                os.path.join(dirpath, file),\n",
    "                encoding=\"utf-8\"\n",
    "            )\n",
    "            document_chunks.extend(loader.load_and_split())\n",
    "        except Exception as e:\n",
    "            pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "7ca1dc28-3ca5-4f69-a342-eb509370045c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "We have 170 chunks.\n"
     ]
    }
   ],
   "source": [
    "print(f\"We have {len(document_chunks)} chunks.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "c7e32912-94b7-41eb-99e1-7cf8d44818bc",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "import unittest\n",
      "import re\n",
      "import pycodestyle\n",
      "\n",
      "from thefuzz import fuzz\n",
      "from thefuzz import process\n",
      "from thefuzz import utils\n",
      "\n",
      "scorers = [\n",
      "    fuzz.ratio,\n",
      "    fuzz.partial_ratio,\n",
      "    fuzz.token_sort_ratio,\n",
      "    fuzz.token_set_ratio,\n",
      "    fuzz.partial_token_sort_ratio,\n",
      "    fuzz.partial_token_set_ratio,\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(document_chunks[0].page_content[:300])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9713e800-cc92-45f2-a8c1-dc374d24461a",
   "metadata": {},
   "source": [
    "**Convert the text chunks in embeddings and store them in a vector database**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "cb255432-d1b0-4711-b5f9-f2ab8c7e84cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.vectorstores import FAISS\n",
    "from langchain.embeddings.openai import OpenAIEmbeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "9368b26b-0506-4a37-84f5-83e145780efb",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/juliocolomer/.pyenv/versions/3.11.4/envs/venv020124/lib/python3.11/site-packages/langchain_core/_api/deprecation.py:117: LangChainDeprecationWarning: The class `langchain_community.embeddings.openai.OpenAIEmbeddings` was deprecated in langchain-community 0.1.0 and will be removed in 0.2.0. An updated version of the class exists in the langchain-openai package and should be used instead. To use it run `pip install -U langchain-openai` and import as `from langchain_openai import OpenAIEmbeddings`.\n",
      "  warn_deprecated(\n"
     ]
    }
   ],
   "source": [
    "embeddings = OpenAIEmbeddings()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "ba2daa7c-f7f4-47e6-938a-61739856ca6a",
   "metadata": {},
   "outputs": [],
   "source": [
    "stored_embeddings = FAISS.from_documents(document_chunks, embeddings)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ebe95a1e-e319-4502-b352-bddb102bb7ff",
   "metadata": {},
   "source": [
    "**Create the RetrievalQA chain**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "e0fdba87-5765-4d50-b57f-7e6a8fbe6d66",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_openai import ChatOpenAI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "cbdc197b-e4b3-420c-979e-22da1831c6c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "chat_model = ChatOpenAI()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "554e2f41-e258-446e-bff1-19f94b5d2991",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.chains import RetrievalQA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "54e20784-cde2-45e8-824e-09eb2b02aa41",
   "metadata": {},
   "outputs": [],
   "source": [
    "qa_chain = RetrievalQA.from_chain_type(\n",
    "    llm=chat_model,\n",
    "    chain_type=\"stuff\",\n",
    "    retriever=stored_embeddings.as_retriever()\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "86e8c251-35da-48fe-8e10-45846a2520ff",
   "metadata": {},
   "source": [
    "**Now we can make questions about the github library**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "673e2df3-f652-4f08-a03b-21b23539ca17",
   "metadata": {},
   "outputs": [],
   "source": [
    "question = \"\"\"\n",
    "What function do I use if I want to find \n",
    "the most similar item in a list of items?\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "bf32dec0-3e20-49f2-967f-2c0203684281",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/juliocolomer/.pyenv/versions/3.11.4/envs/venv020124/lib/python3.11/site-packages/langchain_core/_api/deprecation.py:117: LangChainDeprecationWarning: The function `run` was deprecated in LangChain 0.1.0 and will be removed in 0.2.0. Use invoke instead.\n",
      "  warn_deprecated(\n"
     ]
    }
   ],
   "source": [
    "answer = qa_chain.run(question)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "2b4a0e66-b122-4d01-bb49-24a572b28cb3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "You can use the `process.extractOne()` function from the `thefuzz` library to find the most similar item in a list of items. This function takes a query string and a list of choices, and it returns a tuple containing the best match and its similarity score. Here's an example of how to use it:\n",
      "\n",
      "```python\n",
      "from thefuzz import process\n",
      "\n",
      "choices = [\"apple\", \"banana\", \"orange\"]\n",
      "query = \"aple\"\n",
      "\n",
      "best_match = process.extractOne(query, choices)\n",
      "print(best_match)\n",
      "```\n",
      "\n",
      "Output:\n",
      "```\n",
      "('apple', 80)\n",
      "```\n",
      "\n",
      "In this example, the best match for the query \"aple\" is \"apple\" with a similarity score of 80.\n"
     ]
    }
   ],
   "source": [
    "print(answer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4b08200e-114e-4249-af10-e41855921fde",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
