{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: xgboost in c:\\programdata\\anaconda3\\lib\\site-packages (1.3.1)\n",
      "Requirement already satisfied: scipy in c:\\programdata\\anaconda3\\lib\\site-packages (from xgboost) (1.5.0)\n",
      "Requirement already satisfied: numpy in c:\\programdata\\anaconda3\\lib\\site-packages (from xgboost) (1.18.5)\n"
     ]
    }
   ],
   "source": [
    "!pip install xgboost\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import RandomizedSearchCV\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "from xgboost.sklearn import XGBClassifier\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>satisfaction_level</th>\n",
       "      <th>last_evaluation</th>\n",
       "      <th>number_project</th>\n",
       "      <th>average_montly_hours</th>\n",
       "      <th>time_spend_company</th>\n",
       "      <th>Work_accident</th>\n",
       "      <th>left</th>\n",
       "      <th>promotion_last_5years</th>\n",
       "      <th>sales</th>\n",
       "      <th>salary</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.42</td>\n",
       "      <td>0.46</td>\n",
       "      <td>2</td>\n",
       "      <td>150</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "      <td>medium</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.66</td>\n",
       "      <td>0.77</td>\n",
       "      <td>2</td>\n",
       "      <td>171</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>technical</td>\n",
       "      <td>medium</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.55</td>\n",
       "      <td>0.49</td>\n",
       "      <td>5</td>\n",
       "      <td>240</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>technical</td>\n",
       "      <td>high</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.22</td>\n",
       "      <td>0.88</td>\n",
       "      <td>4</td>\n",
       "      <td>213</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>technical</td>\n",
       "      <td>medium</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.20</td>\n",
       "      <td>0.72</td>\n",
       "      <td>6</td>\n",
       "      <td>224</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>technical</td>\n",
       "      <td>medium</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   satisfaction_level  last_evaluation  number_project  average_montly_hours  \\\n",
       "0                0.42             0.46               2                   150   \n",
       "1                0.66             0.77               2                   171   \n",
       "2                0.55             0.49               5                   240   \n",
       "3                0.22             0.88               4                   213   \n",
       "4                0.20             0.72               6                   224   \n",
       "\n",
       "   time_spend_company  Work_accident  left  promotion_last_5years      sales  \\\n",
       "0                   3              0     1                      0      sales   \n",
       "1                   2              0     0                      0  technical   \n",
       "2                   3              0     0                      0  technical   \n",
       "3                   3              1     0                      0  technical   \n",
       "4                   4              0     1                      0  technical   \n",
       "\n",
       "   salary  \n",
       "0  medium  \n",
       "1  medium  \n",
       "2    high  \n",
       "3  medium  \n",
       "4  medium  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# data prep from previous module\n",
    "file1=r'hr_train.csv'\n",
    "\n",
    "ci=pd.read_csv(file1)\n",
    "ci.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    7424\n",
       "1    3075\n",
       "Name: left, dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ci.left.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2.414308943089431"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "7424/3075"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['sales', 'salary']"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cat_cols=ci.select_dtypes(['object']).columns\n",
    "cat_cols = ['sales', 'salary']\n",
    "cat_cols"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "sales\n",
      "salary\n"
     ]
    }
   ],
   "source": [
    "for col in cat_cols:\n",
    "    freqs=ci[col].value_counts()\n",
    "    selected_cats=freqs.index[freqs>500][:-1]\n",
    "    \n",
    "    print(col)\n",
    "    for cat in selected_cats:\n",
    "        name=col+'_'+cat\n",
    "        \n",
    "        ci[name]=(ci[col]==cat).astype(int)\n",
    "    del ci[col]\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(10499, 18)\n"
     ]
    }
   ],
   "source": [
    "print(ci.shape)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['satisfaction_level', 'last_evaluation', 'number_project',\n",
       "       'average_montly_hours', 'time_spend_company', 'Work_accident',\n",
       "       'promotion_last_5years', 'sales_sales', 'sales_technical',\n",
       "       'sales_support', 'sales_IT', 'sales_product_mng', 'sales_marketing',\n",
       "       'sales_hr', 'sales_RandD', 'salary_low', 'salary_medium'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_train=ci.drop('left',axis=1)\n",
    "y_train=ci['left']\n",
    "x_train.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "del ci"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "def report(results, n_top=3):\n",
    "    for i in range(1, n_top + 1):\n",
    "        candidates = np.flatnonzero(results['rank_test_score'] == i)\n",
    "        for candidate in candidates:\n",
    "            print(\"Model with rank: {0}\".format(i))\n",
    "            print(\"Mean validation score: {0:.3f} (std: {1:.5f})\".format(\n",
    "                  results['mean_test_score'][candidate],\n",
    "                  results['std_test_score'][candidate]))\n",
    "            print(\"Parameters: {0}\".format(results['params'][candidate]))\n",
    "            print(\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "xgb_params = {  \n",
    "                \"learning_rate\":[0.01,0.05,0.1,0.3,0.5],\n",
    "                \"gamma\":[i/10.0 for i in range(0,5)],\n",
    "                \"max_depth\": [2,3,4,5,6,7,8],\n",
    "                \"min_child_weight\":[1,2,5,10],\n",
    "                \"max_delta_step\":[0,1,2,5,10],\n",
    "                \"subsample\":[i/10.0 for i in range(5,10)],\n",
    "                \"colsample_bytree\":[i/10.0 for i in range(5,10)],\n",
    "                \"colsample_bylevel\":[i/10.0 for i in range(5,10)],\n",
    "                \"reg_lambda\":[1e-5, 1e-2, 0.1, 1, 100], \n",
    "                \"reg_alpha\":[1e-5, 1e-2, 0.1, 1, 100],\n",
    "                \"scale_pos_weight\":[1,2,3,4,5,6,7,8,9],\n",
    "                \"n_estimators\":[100,500,700,1000]\n",
    "             }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "xgb=XGBClassifier(objective='binary:logistic')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "n_iter=10\n",
    "\n",
    "random_search=RandomizedSearchCV(xgb,n_jobs=-1,cv=5,n_iter=n_iter,scoring='roc_auc',scale_pos_weight=2.41,\n",
    "                                 param_distributions=xgb_params)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RandomizedSearchCV(cv=5, error_score=nan,\n",
       "                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',\n",
       "                                           colsample_bylevel=1,\n",
       "                                           colsample_bynode=1,\n",
       "                                           colsample_bytree=1, gamma=0,\n",
       "                                           learning_rate=0.1, max_delta_step=0,\n",
       "                                           max_depth=3, min_child_weight=1,\n",
       "                                           missing=None, n_estimators=100,\n",
       "                                           n_jobs=1, nthread=None,\n",
       "                                           objective='binary:logistic',\n",
       "                                           random_state=0, reg_alpha=0,\n",
       "                                           reg_lambda=1, sc...\n",
       "                                        'max_delta_step': [0, 1, 2, 5, 10],\n",
       "                                        'max_depth': [2, 3, 4, 5, 6, 7, 8],\n",
       "                                        'min_child_weight': [1, 2, 5, 10],\n",
       "                                        'n_estimators': [100, 500, 700, 1000],\n",
       "                                        'reg_alpha': [1e-05, 0.01, 0.1, 1, 100],\n",
       "                                        'reg_lambda': [1e-05, 0.01, 0.1, 1,\n",
       "                                                       100],\n",
       "                                        'scale_pos_weight': [1, 2, 3, 4, 5, 6,\n",
       "                                                             7, 8, 9],\n",
       "                                        'subsample': [0.5, 0.6, 0.7, 0.8, 0.9]},\n",
       "                   pre_dispatch='2*n_jobs', random_state=None, refit=True,\n",
       "                   return_train_score=False, scoring='roc_auc', verbose=0)"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "random_search.fit(x_train,y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model with rank: 1\n",
      "Mean validation score: 0.837 (std: 0.01914)\n",
      "Parameters: {'subsample': 0.7, 'scale_pos_weight': 2, 'reg_lambda': 1e-05, 'reg_alpha': 0.1, 'n_estimators': 700, 'min_child_weight': 10, 'max_depth': 4, 'max_delta_step': 10, 'learning_rate': 0.01, 'gamma': 0.1, 'colsample_bytree': 0.8, 'colsample_bylevel': 0.7}\n",
      "\n",
      "Model with rank: 2\n",
      "Mean validation score: 0.837 (std: 0.01955)\n",
      "Parameters: {'subsample': 0.7, 'scale_pos_weight': 4, 'reg_lambda': 0.1, 'reg_alpha': 0.1, 'n_estimators': 700, 'min_child_weight': 2, 'max_depth': 4, 'max_delta_step': 0, 'learning_rate': 0.01, 'gamma': 0.2, 'colsample_bytree': 0.5, 'colsample_bylevel': 0.5}\n",
      "\n",
      "Model with rank: 3\n",
      "Mean validation score: 0.836 (std: 0.01806)\n",
      "Parameters: {'subsample': 0.5, 'scale_pos_weight': 1, 'reg_lambda': 1e-05, 'reg_alpha': 0.01, 'n_estimators': 1000, 'min_child_weight': 10, 'max_depth': 5, 'max_delta_step': 5, 'learning_rate': 0.05, 'gamma': 0.2, 'colsample_bytree': 0.8, 'colsample_bylevel': 0.6}\n",
      "\n",
      "Model with rank: 4\n",
      "Mean validation score: 0.836 (std: 0.01979)\n",
      "Parameters: {'subsample': 0.8, 'scale_pos_weight': 9, 'reg_lambda': 100, 'reg_alpha': 0.1, 'n_estimators': 1000, 'min_child_weight': 5, 'max_depth': 3, 'max_delta_step': 5, 'learning_rate': 0.01, 'gamma': 0.4, 'colsample_bytree': 0.9, 'colsample_bylevel': 0.5}\n",
      "\n",
      "Model with rank: 5\n",
      "Mean validation score: 0.833 (std: 0.01646)\n",
      "Parameters: {'subsample': 0.9, 'scale_pos_weight': 7, 'reg_lambda': 1e-05, 'reg_alpha': 1e-05, 'n_estimators': 700, 'min_child_weight': 5, 'max_depth': 4, 'max_delta_step': 0, 'learning_rate': 0.3, 'gamma': 0.4, 'colsample_bytree': 0.7, 'colsample_bylevel': 0.6}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "report(random_search.cv_results_,5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.7,\n",
       "              colsample_bynode=1, colsample_bytree=0.8, gamma=0.1,\n",
       "              learning_rate=0.01, max_delta_step=10, max_depth=4,\n",
       "              min_child_weight=10, missing=None, n_estimators=700, n_jobs=1,\n",
       "              nthread=None, objective='binary:logistic', random_state=0,\n",
       "              reg_alpha=0.1, reg_lambda=1e-05, scale_pos_weight=2, seed=None,\n",
       "              silent=None, subsample=0.7, verbosity=1)"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "random_search.best_estimator_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "#XGBOOST...by sequencial Param tuning\n",
    "\n",
    "xgb_params = {  \n",
    "                \"n_estimators\":[100,500,700,900,1000,1200,1500]\n",
    "             }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "xgb1=XGBClassifier(learning_rate=0.1,subsample=0.8,colsample_bylevel=0.8,colsample_bytree=0.8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import GridSearchCV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "grid_search=GridSearchCV(xgb1,cv=5,param_grid=xgb_params,scoring='roc_auc',verbose=2,n_jobs=-1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting 5 folds for each of 7 candidates, totalling 35 fits\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.\n",
      "[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:  2.5min finished\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=5, error_score='raise-deprecating',\n",
       "       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.8,\n",
       "       colsample_bynode=1, colsample_bytree=0.8, gamma=0,\n",
       "       learning_rate=0.1, max_delta_step=0, max_depth=3,\n",
       "       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,\n",
       "       nthread=None, objective='binary:logistic', random_state=0,\n",
       "       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,\n",
       "       silent=None, subsample=0.8, verbosity=1),\n",
       "       fit_params=None, iid='warn', n_jobs=-1,\n",
       "       param_grid={'n_estimators': [100, 500, 700, 900, 1000, 1200, 1500]},\n",
       "       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',\n",
       "       scoring='roc_auc', verbose=2)"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "grid_search.fit(x_train,y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model with rank: 1\n",
      "Mean validation score: 0.836 (std: 0.01713)\n",
      "Parameters: {'n_estimators': 900}\n",
      "\n",
      "Model with rank: 2\n",
      "Mean validation score: 0.836 (std: 0.01863)\n",
      "Parameters: {'n_estimators': 100}\n",
      "\n",
      "Model with rank: 3\n",
      "Mean validation score: 0.836 (std: 0.01691)\n",
      "Parameters: {'n_estimators': 700}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "report(grid_search.cv_results_,3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "we got n_estimator=500 as best with learning_rate=0.1  . Next we'll tune max_depth,gamma and min_child_weight, which control overfit by controlling size of individual trees"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "xgb_params={\n",
    "            \"gamma\":[5,8,10,12,15],\n",
    "            \"max_depth\": [6,7,8,9,10,11,12],\n",
    "            #\"min_child_weight\":[0.5,1,2,5,10]\n",
    "            }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "xgb2=XGBClassifier(learning_rate=0.1,n_estimators=100,subsample=0.8,min_child_weight=2, colsample_bylevel=0.8,colsample_bytree=0.8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "random_search=RandomizedSearchCV(xgb2,param_distributions=xgb_params,n_iter=20,cv=5,scoring='roc_auc',\n",
    "                                 n_jobs=-1,verbose=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting 5 folds for each of 20 candidates, totalling 100 fits\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.\n",
      "[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   28.2s\n",
      "[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.4min finished\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "RandomizedSearchCV(cv=5, error_score='raise-deprecating',\n",
       "          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.8,\n",
       "       colsample_bynode=1, colsample_bytree=0.8, gamma=0,\n",
       "       learning_rate=0.1, max_delta_step=0, max_depth=3,\n",
       "       min_child_weight=2, missing=None, n_estimators=100, n_jobs=1,\n",
       "       nthread=None, objective='binary:logistic', random_state=0,\n",
       "       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,\n",
       "       silent=None, subsample=0.8, verbosity=1),\n",
       "          fit_params=None, iid='warn', n_iter=20, n_jobs=-1,\n",
       "          param_distributions={'gamma': [5, 8, 10, 12, 15], 'max_depth': [6, 7, 8, 9, 10, 11, 12]},\n",
       "          pre_dispatch='2*n_jobs', random_state=None, refit=True,\n",
       "          return_train_score='warn', scoring='roc_auc', verbose=2)"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "random_search.fit(x_train,y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model with rank: 1\n",
      "Mean validation score: 0.841 (std: 0.01882)\n",
      "Parameters: {'max_depth': 12, 'gamma': 8}\n",
      "\n",
      "Model with rank: 2\n",
      "Mean validation score: 0.840 (std: 0.02055)\n",
      "Parameters: {'max_depth': 8, 'gamma': 12}\n",
      "\n",
      "Model with rank: 3\n",
      "Mean validation score: 0.840 (std: 0.01999)\n",
      "Parameters: {'max_depth': 10, 'gamma': 12}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "report(random_search.cv_results_,3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.8,\n",
       "       colsample_bynode=1, colsample_bytree=0.8, gamma=8,\n",
       "       learning_rate=0.1, max_delta_step=0, max_depth=12,\n",
       "       min_child_weight=2, missing=None, n_estimators=100, n_jobs=1,\n",
       "       nthread=None, objective='binary:logistic', random_state=0,\n",
       "       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,\n",
       "       silent=None, subsample=0.8, verbosity=1)"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "random_search.best_estimator_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "we got best values for parameters being tuned as follows : {'min_child_weight': 1, 'gamma': 0, 'max_depth': 3}\n",
    "\n",
    "Since there is imbalance in the data , we'll look into max_delta_step and scale_pos_weight next"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0    7424\n",
       "1.0    3075\n",
       "Name: left, dtype: int64"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_train.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "xgb_params={\n",
    "            'max_delta_step':[0,1,3,6,10],\n",
    "            'scale_pos_weight':[1,2,3,4]\n",
    "            }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "xgb3=XGBClassifier(learning_rate=0.1,n_estimators=100,min_child_weight=2,gamma=12,max_depth=8,\n",
    "                  subsample=0.8,colsample_bylevel=0.8,colsample_bytree=0.8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "grid_search=GridSearchCV(xgb3,param_grid=xgb_params,cv=5,scoring='roc_auc',n_jobs=-1,verbose=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting 5 folds for each of 20 candidates, totalling 100 fits\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.\n",
      "[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   12.0s\n",
      "[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   14.9s\n",
      "[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   20.8s\n",
      "[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   24.0s\n",
      "[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   32.5s\n",
      "[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   38.4s\n",
      "[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   46.8s\n",
      "[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   53.3s\n",
      "[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  1.1min\n",
      "[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  1.2min\n",
      "[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.4min finished\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=5, error_score='raise-deprecating',\n",
       "       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.8,\n",
       "       colsample_bynode=1, colsample_bytree=0.8, gamma=12,\n",
       "       learning_rate=0.1, max_delta_step=0, max_depth=8,\n",
       "       min_child_weight=2, missing=None, n_estimators=100, n_jobs=1,\n",
       "       nthread=None, objective='binary:logistic', random_state=0,\n",
       "       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,\n",
       "       silent=None, subsample=0.8, verbosity=1),\n",
       "       fit_params=None, iid='warn', n_jobs=-1,\n",
       "       param_grid={'max_delta_step': [0, 1, 3, 6, 10], 'scale_pos_weight': [1, 2, 3, 4]},\n",
       "       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',\n",
       "       scoring='roc_auc', verbose=10)"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "grid_search.fit(x_train,y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model with rank: 1\n",
      "Mean validation score: 0.840 (std: 0.02055)\n",
      "Parameters: {'max_delta_step': 0, 'scale_pos_weight': 1}\n",
      "\n",
      "Model with rank: 1\n",
      "Mean validation score: 0.840 (std: 0.02055)\n",
      "Parameters: {'max_delta_step': 3, 'scale_pos_weight': 1}\n",
      "\n",
      "Model with rank: 1\n",
      "Mean validation score: 0.840 (std: 0.02055)\n",
      "Parameters: {'max_delta_step': 6, 'scale_pos_weight': 1}\n",
      "\n",
      "Model with rank: 1\n",
      "Mean validation score: 0.840 (std: 0.02055)\n",
      "Parameters: {'max_delta_step': 10, 'scale_pos_weight': 1}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "report(grid_search.cv_results_,3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "it turns out that , since imbalance was not that severe , defaults come out as best choices {'scale_pos_weight': 1, 'max_delta_step': 0}\n",
    "\n",
    "Next we check the effect of the noise on data and tune , subsample , colsample_bytree and colsample_bylevel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "xgb_params={\n",
    "            'subsample':[i/10 for i in range(5,11)],\n",
    "            'colsample_bytree':[i/10 for i in range(5,11)],\n",
    "            'colsample_bylevel':[i/10 for i in range(5,11)]\n",
    "            }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "xgb4=XGBClassifier(learning_rate=0.1,n_estimators=100,min_child_weight=2,gamma=12,max_depth=8,\n",
    "                   scale_pos_weight=1,max_delta_step=0\n",
    "                  )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "random_search=RandomizedSearchCV(xgb4,param_distributions=xgb_params,cv=5,n_iter=20,scoring='roc_auc',\n",
    "                                n_jobs=-1,verbose=20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting 5 folds for each of 20 candidates, totalling 100 fits\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.\n",
      "[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    8.1s\n",
      "[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    8.1s\n",
      "[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    8.1s\n",
      "[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    8.2s\n",
      "[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   10.3s\n",
      "[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   10.3s\n",
      "[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   10.3s\n",
      "[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   10.4s\n",
      "[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   13.8s\n",
      "[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   13.9s\n",
      "[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:   13.9s\n",
      "[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   13.9s\n",
      "[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:   16.1s\n",
      "[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   16.1s\n",
      "[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   16.2s\n",
      "[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   16.4s\n",
      "[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   18.5s\n",
      "[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   18.6s\n",
      "[Parallel(n_jobs=-1)]: Done  19 tasks      | elapsed:   18.8s\n",
      "[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:   18.9s\n",
      "[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:   22.1s\n",
      "[Parallel(n_jobs=-1)]: Done  22 tasks      | elapsed:   22.1s\n",
      "[Parallel(n_jobs=-1)]: Done  23 tasks      | elapsed:   22.3s\n",
      "[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   22.5s\n",
      "[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   24.4s\n",
      "[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   24.7s\n",
      "[Parallel(n_jobs=-1)]: Done  27 tasks      | elapsed:   24.8s\n",
      "[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   25.7s\n",
      "[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:   27.6s\n",
      "[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:   27.9s\n",
      "[Parallel(n_jobs=-1)]: Done  31 tasks      | elapsed:   28.2s\n",
      "[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:   29.1s\n",
      "[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   30.4s\n",
      "[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   30.6s\n",
      "[Parallel(n_jobs=-1)]: Done  35 tasks      | elapsed:   30.8s\n",
      "[Parallel(n_jobs=-1)]: Done  36 tasks      | elapsed:   32.1s\n",
      "[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   33.3s\n",
      "[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:   33.5s\n",
      "[Parallel(n_jobs=-1)]: Done  39 tasks      | elapsed:   33.7s\n",
      "[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:   35.0s\n",
      "[Parallel(n_jobs=-1)]: Done  41 tasks      | elapsed:   36.2s\n",
      "[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   36.4s\n",
      "[Parallel(n_jobs=-1)]: Done  43 tasks      | elapsed:   36.7s\n",
      "[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:   38.2s\n",
      "[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   38.8s\n",
      "[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   38.9s\n",
      "[Parallel(n_jobs=-1)]: Done  47 tasks      | elapsed:   39.2s\n",
      "[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:   40.3s\n",
      "[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:   40.9s\n",
      "[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:   41.1s\n",
      "[Parallel(n_jobs=-1)]: Done  51 tasks      | elapsed:   41.6s\n",
      "[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:   42.8s\n",
      "[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   43.3s\n",
      "[Parallel(n_jobs=-1)]: Done  54 tasks      | elapsed:   43.5s\n",
      "[Parallel(n_jobs=-1)]: Done  55 tasks      | elapsed:   44.0s\n",
      "[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   45.3s\n",
      "[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:   45.8s\n",
      "[Parallel(n_jobs=-1)]: Done  58 tasks      | elapsed:   46.0s\n",
      "[Parallel(n_jobs=-1)]: Done  59 tasks      | elapsed:   46.5s\n",
      "[Parallel(n_jobs=-1)]: Done  60 tasks      | elapsed:   47.8s\n",
      "[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed:   48.4s\n",
      "[Parallel(n_jobs=-1)]: Done  62 tasks      | elapsed:   48.6s\n",
      "[Parallel(n_jobs=-1)]: Done  63 tasks      | elapsed:   49.1s\n",
      "[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   50.4s\n",
      "[Parallel(n_jobs=-1)]: Done  65 tasks      | elapsed:   50.9s\n",
      "[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:   51.5s\n",
      "[Parallel(n_jobs=-1)]: Done  67 tasks      | elapsed:   52.0s\n",
      "[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:   53.4s\n",
      "[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:   53.9s\n",
      "[Parallel(n_jobs=-1)]: Done  70 tasks      | elapsed:   54.6s\n",
      "[Parallel(n_jobs=-1)]: Done  71 tasks      | elapsed:   55.6s\n",
      "[Parallel(n_jobs=-1)]: Done  72 tasks      | elapsed:   56.9s\n",
      "[Parallel(n_jobs=-1)]: Done  73 tasks      | elapsed:   57.3s\n",
      "[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:   57.9s\n",
      "[Parallel(n_jobs=-1)]: Done  75 tasks      | elapsed:   58.9s\n",
      "[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:   59.0s\n",
      "[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   59.4s\n",
      "[Parallel(n_jobs=-1)]: Done  78 tasks      | elapsed:   60.0s\n",
      "[Parallel(n_jobs=-1)]: Done  79 tasks      | elapsed:  1.0min\n",
      "[Parallel(n_jobs=-1)]: Done  80 tasks      | elapsed:  1.0min\n",
      "[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:  1.0min\n",
      "[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:  1.0min\n",
      "[Parallel(n_jobs=-1)]: Done  83 tasks      | elapsed:  1.1min\n",
      "[Parallel(n_jobs=-1)]: Done  84 tasks      | elapsed:  1.1min\n",
      "[Parallel(n_jobs=-1)]: Done  85 tasks      | elapsed:  1.1min\n",
      "[Parallel(n_jobs=-1)]: Done  86 tasks      | elapsed:  1.1min\n",
      "[Parallel(n_jobs=-1)]: Done  87 tasks      | elapsed:  1.1min\n",
      "[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:  1.1min\n",
      "[Parallel(n_jobs=-1)]: Done  89 tasks      | elapsed:  1.1min\n",
      "[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  1.1min\n",
      "[Parallel(n_jobs=-1)]: Done  91 tasks      | elapsed:  1.2min\n",
      "[Parallel(n_jobs=-1)]: Done  92 tasks      | elapsed:  1.2min\n",
      "[Parallel(n_jobs=-1)]: Done  93 tasks      | elapsed:  1.2min\n",
      "[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.3min finished\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "RandomizedSearchCV(cv=5, error_score='raise-deprecating',\n",
       "          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n",
       "       colsample_bynode=1, colsample_bytree=1, gamma=12, learning_rate=0.1,\n",
       "       max_delta_step=0, max_depth=8, min_child_weight=2, missing=None,\n",
       "       n_estimators=100, n_jobs=1, nthread=None,\n",
       "       objective='binary:logistic', random_state=0, reg_alpha=0,\n",
       "       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,\n",
       "       subsample=1, verbosity=1),\n",
       "          fit_params=None, iid='warn', n_iter=20, n_jobs=-1,\n",
       "          param_distributions={'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0], 'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0], 'colsample_bylevel': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]},\n",
       "          pre_dispatch='2*n_jobs', random_state=None, refit=True,\n",
       "          return_train_score='warn', scoring='roc_auc', verbose=20)"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "random_search.fit(x_train,y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model with rank: 1\n",
      "Mean validation score: 0.841 (std: 0.01786)\n",
      "Parameters: {'subsample': 0.7, 'colsample_bytree': 0.8, 'colsample_bylevel': 1.0}\n",
      "\n",
      "Model with rank: 2\n",
      "Mean validation score: 0.841 (std: 0.01984)\n",
      "Parameters: {'subsample': 1.0, 'colsample_bytree': 1.0, 'colsample_bylevel': 0.5}\n",
      "\n",
      "Model with rank: 3\n",
      "Mean validation score: 0.840 (std: 0.02038)\n",
      "Parameters: {'subsample': 0.7, 'colsample_bytree': 1.0, 'colsample_bylevel': 0.6}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "report(random_search.cv_results_,3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "bets values that we got for paramaeters are as follows : {'colsample_bylevel': 0.5, 'colsample_bytree': 0.6, 'subsample': 1.0}\n",
    "\n",
    "lastly we can work on L2 and L1 penalty on leaf node score to further reduce overfit if there is any"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "xgb5=XGBClassifier(learning_rate=0.1,n_estimators=100,min_child_weight=2,gamma=12,max_depth=8,\n",
    "                   scale_pos_weight=1,max_delta_step=0,\n",
    "                   colsample_bylevel= 1.0, colsample_bytree= 0.8, subsample= 0.7\n",
    "                  )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "xgb_params={\n",
    "            'reg_lambda':[i/10 for i in range(0,50)],\n",
    "            'reg_alpha':[i/10 for i in range(0,50)]\n",
    "            }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "random_search=RandomizedSearchCV(xgb5,param_distributions=xgb_params,cv=5,n_iter=20,scoring='roc_auc',\n",
    "                                n_jobs=-1,verbose=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting 5 folds for each of 20 candidates, totalling 100 fits\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.\n",
      "[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   12.2s\n",
      "[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   15.4s\n",
      "[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   22.5s\n",
      "[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   25.9s\n",
      "[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   36.6s\n",
      "[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   43.2s\n",
      "[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   54.9s\n",
      "[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.1min\n",
      "[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  1.3min\n",
      "[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  1.5min\n",
      "[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.6min finished\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "RandomizedSearchCV(cv=5, error_score='raise-deprecating',\n",
       "          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1.0,\n",
       "       colsample_bynode=1, colsample_bytree=0.8, gamma=12,\n",
       "       learning_rate=0.1, max_delta_step=0, max_depth=8,\n",
       "       min_child_weight=2, missing=None, n_estimators=100, n_jobs=1,\n",
       "       nthread=None, objective='binary:logistic', random_state=0,\n",
       "       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,\n",
       "       silent=None, subsample=0.7, verbosity=1),\n",
       "          fit_params=None, iid='warn', n_iter=20, n_jobs=-1,\n",
       "          param_distributions={'reg_lambda': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9], 'reg_alpha': ...3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9]},\n",
       "          pre_dispatch='2*n_jobs', random_state=None, refit=True,\n",
       "          return_train_score='warn', scoring='roc_auc', verbose=10)"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "random_search.fit(x_train,y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model with rank: 1\n",
      "Mean validation score: 0.841 (std: 0.02121)\n",
      "Parameters: {'reg_lambda': 1.8, 'reg_alpha': 0.0}\n",
      "\n",
      "Model with rank: 2\n",
      "Mean validation score: 0.841 (std: 0.02096)\n",
      "Parameters: {'reg_lambda': 0.5, 'reg_alpha': 1.9}\n",
      "\n",
      "Model with rank: 3\n",
      "Mean validation score: 0.840 (std: 0.01993)\n",
      "Parameters: {'reg_lambda': 0.1, 'reg_alpha': 1.0}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "report(random_search.cv_results_,3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The best value that we got here is {'reg_lambda': 1.5, 'reg_alpha': 0.0}, but the performance has gone down. May be the default was doing better and wasnt picked as one of the candidates here in the random_search. we'll go with those defaults values instead"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "xgb6=XGBClassifier(learning_rate=0.1,n_estimators=100,min_child_weight=2,gamma=12,max_depth=8,\n",
    "                   scale_pos_weight=1,max_delta_step=0,\n",
    "                   colsample_bylevel= 1.0, colsample_bytree= 0.8, subsample= 0.7,\n",
    "                  reg_lambda=1.8,reg_alpha=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If we want to simply get cv performance of a model , without having to select any parameters we can make use of cross_validation_score function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1.0,\n",
       "       colsample_bynode=1, colsample_bytree=0.8, gamma=12,\n",
       "       learning_rate=0.1, max_delta_step=0, max_depth=8,\n",
       "       min_child_weight=2, missing=None, n_estimators=100, n_jobs=1,\n",
       "       nthread=None, objective='binary:logistic', random_state=0,\n",
       "       reg_alpha=0, reg_lambda=1.8, scale_pos_weight=1, seed=None,\n",
       "       silent=None, subsample=0.7, verbosity=1)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "xgb6.fit(x_train,y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.\n",
      "[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   17.7s remaining:   17.7s\n",
      "[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   18.1s remaining:    7.7s\n",
      "[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   25.4s finished\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([0.83939277, 0.84522644, 0.80372   , 0.81151571, 0.8525834 ,\n",
       "       0.88518135, 0.84040185, 0.82995162, 0.82688745, 0.83156492])"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.model_selection import cross_val_score\n",
    "\n",
    "cross_val_score(xgb6,x_train,y_train,scoring='roc_auc',verbose=10,n_jobs=-1,cv=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "#lambda 0.5 alpha 3.5 - mean =0.9308910909999998  std= 0.004558141295680632\n",
    "scores=[0.83939277, 0.84522644, 0.80372   , 0.81151571, 0.8525834 ,\n",
    "       0.88518135, 0.84040185, 0.82995162, 0.82688745, 0.83156492]\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8366425510000001"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.mean(scores)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.021416045974485785"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.std(scores)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
