Commit a6dbf6a

susanli2016 susanli2016

authored

Add notebook

1 parent f6da528 commit a6dbf6a (Copy full SHA for a6dbf6a)

File tree

1 file changed

+313

-0

lines changed

cv_load_CatBoost.ipynb

1 file changed

+313

-0

lines changed

`‎cv_load_CatBoost.ipynb‎`

Lines changed: 313 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,313 @@`
	`1`	`+{`
	`2`	`+ "cells": [`
	`3`	`+ {`
	`4`	`+ "cell_type": "code",`
	`5`	`+ "execution_count": 1,`
	`6`	`+ "metadata": {},`
	`7`	`+ "outputs": [],`
	`8`	`+ "source": [`
	`9`	`+ "from collections import Counter\n",`
	`10`	`+ "\n",`
	`11`	`+ "dtype = {\n",`
	`12`	`+ " 'Usage': 'category',\n",`
	`13`	`+ " 'Description': 'category',\n",`
	`14`	`+ " 'status': 'category',\n",`
	`15`	`+ "}\n",`
	`16`	`+ "df = pd.read_csv('data/reservations.csv.gz', dtype=dtype, parse_dates=['created', 'arrival', 'departure'])\n",`
	`17`	`+ "df.drop(columns='Usage', inplace=True)\n",`
	`18`	`+ "\n",`
	`19`	`+ "df.loc[df['cancel_date'] == '0001-01-01T00:00:00', ['cancel_date']] = None\n",`
	`20`	`+ "df['cancel_date'] = pd.to_datetime(df['cancel_date'])\n",`
	`21`	`+ "\n",`
	`22`	`+ "df['arrival_year'] = df['arrival'].dt.year"`
	`23`	`+ ]`
	`24`	`+ },`
	`25`	`+ {`
	`26`	`+ "cell_type": "code",`
	`27`	`+ "execution_count": 2,`
	`28`	`+ "metadata": {},`
	`29`	`+ "outputs": [],`
	`30`	`+ "source": [`
	`31`	`+ "appearances = {}\n",`
	`32`	`+ "for contract_id in df['contract_id'].unique():\n",`
	`33`	`+ " subset_df = df.loc[df['contract_id'] == contract_id].sort_values(by=['arrival', 'created'])\n",`
	`34`	`+ " # save the last known state\n",`
	`35`	`+ " # fill in blanks for bad years\n",`
	`36`	`+ " # handle case where cancel year might come after a series of misses\n",`
	`37`	`+ " yearly_state = {arrival_year: status for (created, arrival_year, status) in subset_df[['created', 'arrival_year', 'status']].itertuples(index=False, name=None)}\n",`
	`38`	`+ " earliest = subset_df['arrival_year'].min()\n",`
	`39`	`+ " latest = min(subset_df['arrival_year'].max(), 2019)\n",`
	`40`	`+ " activity = [(year, yearly_state.get(year, 'no-show')) for year in range(earliest, latest + 1)] \n",`
	`41`	`+ " if activity:\n",`
	`42`	`+ " resort_id = subset_df['resort_id'].values[0]\n",`
	`43`	`+ " appearances[str(contract_id)] = [resort_id] + activity"`
	`44`	`+ ]`
	`45`	`+ },`
	`46`	`+ {`
	`47`	`+ "cell_type": "code",`
	`48`	`+ "execution_count": 3,`
	`49`	`+ "metadata": {},`
	`50`	`+ "outputs": [],`
	`51`	`+ "source": [`
	`52`	`+ "rows = []\n",`
	`53`	`+ "for r in list(appearances.values()):\n",`
	`54`	`+ " resort_id, activity = r[0], r[1:]\n",`
	`55`	`+ " row = [None] * 5\n",`
	`56`	`+ " row[-len(activity):] = [s for year, s in activity]\n",`
	`57`	`+ " rows.append([resort_id] + row)"`
	`58`	`+ ]`
	`59`	`+ },`
	`60`	`+ {`
	`61`	`+ "cell_type": "code",`
	`62`	`+ "execution_count": 4,`
	`63`	`+ "metadata": {},`
	`64`	`+ "outputs": [],`
	`65`	`+ "source": [`
	`66`	`+ "df = pd.DataFrame(rows, columns=['resort_id', 'year_2015', 'year_2016', 'year_2017', 'year_2018', 'year_2019'])"`
	`67`	`+ ]`
	`68`	`+ },`
	`69`	`+ {`
	`70`	`+ "cell_type": "code",`
	`71`	`+ "execution_count": 5,`
	`72`	`+ "metadata": {},`
	`73`	`+ "outputs": [`
	`74`	`+ {`
	`75`	`+ "data": {`
	`76`	`+ "text/plain": [`
	`77`	`+ "resort_id 0.000000\n",`
	`78`	`+ "year_2015 0.515846\n",`
	`79`	`+ "year_2016 0.205276\n",`
	`80`	`+ "year_2017 0.098678\n",`
	`81`	`+ "year_2018 0.048311\n",`
	`82`	`+ "year_2019 0.000000\n",`
	`83`	`+ "dtype: float64"`
	`84`	`+ ]`
	`85`	`+ },`
	`86`	`+ "execution_count": 5,`
	`87`	`+ "metadata": {},`
	`88`	`+ "output_type": "execute_result"`
	`89`	`+ }`
	`90`	`+ ],`
	`91`	`+ "source": [`
	`92`	`+ "df.isnull().sum() / df.shape[0]"`
	`93`	`+ ]`
	`94`	`+ },`
	`95`	`+ {`
	`96`	`+ "cell_type": "code",`
	`97`	`+ "execution_count": 6,`
	`98`	`+ "metadata": {},`
	`99`	`+ "outputs": [],`
	`100`	`+ "source": [`
	`101`	`+ "df.fillna('missing', inplace=True)"`
	`102`	`+ ]`
	`103`	`+ },`
	`104`	`+ {`
	`105`	`+ "cell_type": "code",`
	`106`	`+ "execution_count": 9,`
	`107`	`+ "metadata": {},`
	`108`	`+ "outputs": [`
	`109`	`+ {`
	`110`	`+ "data": {`
	`111`	`+ "text/plain": [`
	`112`	`+ "resort_id 57\n",`
	`113`	`+ "year_2015 4\n",`
	`114`	`+ "year_2016 4\n",`
	`115`	`+ "year_2017 4\n",`
	`116`	`+ "year_2018 4\n",`
	`117`	`+ "year_2019 3\n",`
	`118`	`+ "dtype: int64"`
	`119`	`+ ]`
	`120`	`+ },`
	`121`	`+ "execution_count": 9,`
	`122`	`+ "metadata": {},`
	`123`	`+ "output_type": "execute_result"`
	`124`	`+ }`
	`125`	`+ ],`
	`126`	`+ "source": [`
	`127`	`+ "df.nunique()"`
	`128`	`+ ]`
	`129`	`+ },`
	`130`	`+ {`
	`131`	`+ "cell_type": "code",`
	`132`	`+ "execution_count": 11,`
	`133`	`+ "metadata": {},`
	`134`	`+ "outputs": [],`
	`135`	`+ "source": [`
	`136`	`+ "X = df.drop(\"year_2019\", axis=1)\n",`
	`137`	`+ "y = df[\"year_2019\"]"`
	`138`	`+ ]`
	`139`	`+ },`
	`140`	`+ {`
	`141`	`+ "cell_type": "code",`
	`142`	`+ "execution_count": 12,`
	`143`	`+ "metadata": {},`
	`144`	`+ "outputs": [`
	`145`	`+ {`
	`146`	`+ "name": "stdout",`
	`147`	`+ "output_type": "stream",`
	`148`	`+ "text": [`
	`149`	`+ "[0, 1, 2, 3, 4]\n"`
	`150`	`+ ]`
	`151`	`+ }`
	`152`	`+ ],`
	`153`	`+ "source": [`
	`154`	`+ "cat_features = list(range(0, X.shape[1]))\n",`
	`155`	`+ "print(cat_features)"`
	`156`	`+ ]`
	`157`	`+ },`
	`158`	`+ {`
	`159`	`+ "cell_type": "code",`
	`160`	`+ "execution_count": 13,`
	`161`	`+ "metadata": {},`
	`162`	`+ "outputs": [],`
	`163`	`+ "source": [`
	`164`	`+ "from sklearn.model_selection import train_test_split\n",`
	`165`	`+ "\n",`
	`166`	`+ "\n",`
	`167`	`+ "X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=0)"`
	`168`	`+ ]`
	`169`	`+ },`
	`170`	`+ {`
	`171`	`+ "cell_type": "code",`
	`172`	`+ "execution_count": 14,`
	`173`	`+ "metadata": {},`
	`174`	`+ "outputs": [`
	`175`	`+ {`
	`176`	`+ "name": "stdout",`
	`177`	`+ "output_type": "stream",`
	`178`	`+ "text": [`
	`179`	`+ "0:\tlearn: 0.9089395\ttest: 0.9088238\tbest: 0.9088238 (0)\ttotal: 83.9ms\tremaining: 755ms\n",`
	`180`	`+ "5:\tlearn: 0.6921539\ttest: 0.6930650\tbest: 0.6930650 (5)\ttotal: 219ms\tremaining: 146ms\n",`
	`181`	`+ "9:\tlearn: 0.6811982\ttest: 0.6819978\tbest: 0.6819978 (9)\ttotal: 319ms\tremaining: 0us\n",`
	`182`	`+ "\n",`
	`183`	`+ "bestTest = 0.6819978385\n",`
	`184`	`+ "bestIteration = 9\n",`
	`185`	`+ "\n"`
	`186`	`+ ]`
	`187`	`+ },`
	`188`	`+ {`
	`189`	`+ "data": {`
	`190`	`+ "text/plain": [`
	`191`	`+ "<catboost.core.CatBoostClassifier at 0x7f2c8c533390>"`
	`192`	`+ ]`
	`193`	`+ },`
	`194`	`+ "execution_count": 14,`
	`195`	`+ "metadata": {},`
	`196`	`+ "output_type": "execute_result"`
	`197`	`+ }`
	`198`	`+ ],`
	`199`	`+ "source": [`
	`200`	`+ "from catboost import CatBoostClassifier\n",`
	`201`	`+ "\n",`
	`202`	`+ "clf = CatBoostClassifier(iterations=10, verbose=5, learning_rate=0.5)\n",`
	`203`	`+ "clf.fit(X_train, y_train, cat_features=cat_features, eval_set=(X_val, y_val))"`
	`204`	`+ ]`
	`205`	`+ },`
	`206`	`+ {`
	`207`	`+ "cell_type": "code",`
	`208`	`+ "execution_count": 15,`
	`209`	`+ "metadata": {},`
	`210`	`+ "outputs": [],`
	`211`	`+ "source": [`
	`212`	`+ "y_prob = clf.predict_proba(data=X_val)\n",`
	`213`	`+ "y_pred = clf.predict(data=X_val)"`
	`214`	`+ ]`
	`215`	`+ },`
	`216`	`+ {`
	`217`	`+ "cell_type": "code",`
	`218`	`+ "execution_count": 16,`
	`219`	`+ "metadata": {},`
	`220`	`+ "outputs": [`
	`221`	`+ {`
	`222`	`+ "name": "stdout",`
	`223`	`+ "output_type": "stream",`
	`224`	`+ "text": [`
	`225`	`+ "Accuracy: 0.7176913425345044\n"`
	`226`	`+ ]`
	`227`	`+ }`
	`228`	`+ ],`
	`229`	`+ "source": [`
	`230`	`+ "from sklearn.metrics import accuracy_score\n",`
	`231`	`+ "\n",`
	`232`	`+ "accuracy = accuracy_score(y_val, y_pred)\n",`
	`233`	`+ "print('Accuracy:', accuracy)"`
	`234`	`+ ]`
	`235`	`+ },`
	`236`	`+ {`
	`237`	`+ "cell_type": "code",`
	`238`	`+ "execution_count": 17,`
	`239`	`+ "metadata": {},`
	`240`	`+ "outputs": [`
	`241`	`+ {`
	`242`	`+ "name": "stdout",`
	`243`	`+ "output_type": "stream",`
	`244`	`+ "text": [`
	`245`	`+ "AUC: 0.8310445038554471\n"`
	`246`	`+ ]`
	`247`	`+ }`
	`248`	`+ ],`
	`249`	`+ "source": [`
	`250`	`+ "from sklearn.metrics import roc_auc_score\n",`
	`251`	`+ "\n",`
	`252`	`+ "auc = roc_auc_score(y_val, y_prob, multi_class=\"ovo\", average=\"macro\")\n",`
	`253`	`+ "print('AUC:', auc)"`
	`254`	`+ ]`
	`255`	`+ },`
	`256`	`+ {`
	`257`	`+ "cell_type": "code",`
	`258`	`+ "execution_count": 18,`
	`259`	`+ "metadata": {},`
	`260`	`+ "outputs": [`
	`261`	`+ {`
	`262`	`+ "name": "stdout",`
	`263`	`+ "output_type": "stream",`
	`264`	`+ "text": [`
	`265`	`+ " precision recall f1-score support\n",`
	`266`	`+ "\n",`
	`267`	`+ " active 0.75 0.87 0.80 11582\n",`
	`268`	`+ " cancelled 0.63 0.52 0.57 5562\n",`
	`269`	`+ " no-show 0.73 0.52 0.61 3578\n",`
	`270`	`+ "\n",`
	`271`	`+ " accuracy 0.72 20722\n",`
	`272`	`+ " macro avg 0.70 0.64 0.66 20722\n",`
	`273`	`+ "weighted avg 0.71 0.72 0.71 20722\n",`
	`274`	`+ "\n"`
	`275`	`+ ]`
	`276`	`+ }`
	`277`	`+ ],`
	`278`	`+ "source": [`
	`279`	`+ "from sklearn.metrics import classification_report\n",`
	`280`	`+ "\n",`
	`281`	`+ "print(classification_report(y_val, y_pred))"`
	`282`	`+ ]`
	`283`	`+ },`
	`284`	`+ {`
	`285`	`+ "cell_type": "code",`
	`286`	`+ "execution_count": null,`
	`287`	`+ "metadata": {},`
	`288`	`+ "outputs": [],`
	`289`	`+ "source": []`
	`290`	`+ }`
	`291`	`+ ],`
	`292`	`+ "metadata": {`
	`293`	`+ "kernelspec": {`
	`294`	`+ "display_name": "Python 3",`
	`295`	`+ "language": "python",`
	`296`	`+ "name": "python3"`
	`297`	`+ },`
	`298`	`+ "language_info": {`
	`299`	`+ "codemirror_mode": {`
	`300`	`+ "name": "ipython",`
	`301`	`+ "version": 3`
	`302`	`+ },`
	`303`	`+ "file_extension": ".py",`
	`304`	`+ "mimetype": "text/x-python",`
	`305`	`+ "name": "python",`
	`306`	`+ "nbconvert_exporter": "python",`
	`307`	`+ "pygments_lexer": "ipython3",`
	`308`	`+ "version": "3.6.7"`
	`309`	`+ }`
	`310`	`+ },`
	`311`	`+ "nbformat": 4,`
	`312`	`+ "nbformat_minor": 2`
	`313`	`+}`

0 commit comments

Comments

(0)

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit a6dbf6a

File tree

1 file changed

1 file changed

`‎cv_load_CatBoost.ipynb‎`

0 commit comments