Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit a6dbf6a

Browse files
Add notebook
1 parent f6da528 commit a6dbf6a

File tree

1 file changed

+313
-0
lines changed

1 file changed

+313
-0
lines changed

‎cv_load_CatBoost.ipynb‎

Lines changed: 313 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,313 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"from collections import Counter\n",
10+
"\n",
11+
"dtype = {\n",
12+
" 'Usage': 'category',\n",
13+
" 'Description': 'category',\n",
14+
" 'status': 'category',\n",
15+
"}\n",
16+
"df = pd.read_csv('data/reservations.csv.gz', dtype=dtype, parse_dates=['created', 'arrival', 'departure'])\n",
17+
"df.drop(columns='Usage', inplace=True)\n",
18+
"\n",
19+
"df.loc[df['cancel_date'] == '0001-01-01T00:00:00', ['cancel_date']] = None\n",
20+
"df['cancel_date'] = pd.to_datetime(df['cancel_date'])\n",
21+
"\n",
22+
"df['arrival_year'] = df['arrival'].dt.year"
23+
]
24+
},
25+
{
26+
"cell_type": "code",
27+
"execution_count": 2,
28+
"metadata": {},
29+
"outputs": [],
30+
"source": [
31+
"appearances = {}\n",
32+
"for contract_id in df['contract_id'].unique():\n",
33+
" subset_df = df.loc[df['contract_id'] == contract_id].sort_values(by=['arrival', 'created'])\n",
34+
" # save the last known state\n",
35+
" # fill in blanks for bad years\n",
36+
" # handle case where cancel year might come after a series of misses\n",
37+
" yearly_state = {arrival_year: status for (created, arrival_year, status) in subset_df[['created', 'arrival_year', 'status']].itertuples(index=False, name=None)}\n",
38+
" earliest = subset_df['arrival_year'].min()\n",
39+
" latest = min(subset_df['arrival_year'].max(), 2019)\n",
40+
" activity = [(year, yearly_state.get(year, 'no-show')) for year in range(earliest, latest + 1)] \n",
41+
" if activity:\n",
42+
" resort_id = subset_df['resort_id'].values[0]\n",
43+
" appearances[str(contract_id)] = [resort_id] + activity"
44+
]
45+
},
46+
{
47+
"cell_type": "code",
48+
"execution_count": 3,
49+
"metadata": {},
50+
"outputs": [],
51+
"source": [
52+
"rows = []\n",
53+
"for r in list(appearances.values()):\n",
54+
" resort_id, activity = r[0], r[1:]\n",
55+
" row = [None] * 5\n",
56+
" row[-len(activity):] = [s for year, s in activity]\n",
57+
" rows.append([resort_id] + row)"
58+
]
59+
},
60+
{
61+
"cell_type": "code",
62+
"execution_count": 4,
63+
"metadata": {},
64+
"outputs": [],
65+
"source": [
66+
"df = pd.DataFrame(rows, columns=['resort_id', 'year_2015', 'year_2016', 'year_2017', 'year_2018', 'year_2019'])"
67+
]
68+
},
69+
{
70+
"cell_type": "code",
71+
"execution_count": 5,
72+
"metadata": {},
73+
"outputs": [
74+
{
75+
"data": {
76+
"text/plain": [
77+
"resort_id 0.000000\n",
78+
"year_2015 0.515846\n",
79+
"year_2016 0.205276\n",
80+
"year_2017 0.098678\n",
81+
"year_2018 0.048311\n",
82+
"year_2019 0.000000\n",
83+
"dtype: float64"
84+
]
85+
},
86+
"execution_count": 5,
87+
"metadata": {},
88+
"output_type": "execute_result"
89+
}
90+
],
91+
"source": [
92+
"df.isnull().sum() / df.shape[0]"
93+
]
94+
},
95+
{
96+
"cell_type": "code",
97+
"execution_count": 6,
98+
"metadata": {},
99+
"outputs": [],
100+
"source": [
101+
"df.fillna('missing', inplace=True)"
102+
]
103+
},
104+
{
105+
"cell_type": "code",
106+
"execution_count": 9,
107+
"metadata": {},
108+
"outputs": [
109+
{
110+
"data": {
111+
"text/plain": [
112+
"resort_id 57\n",
113+
"year_2015 4\n",
114+
"year_2016 4\n",
115+
"year_2017 4\n",
116+
"year_2018 4\n",
117+
"year_2019 3\n",
118+
"dtype: int64"
119+
]
120+
},
121+
"execution_count": 9,
122+
"metadata": {},
123+
"output_type": "execute_result"
124+
}
125+
],
126+
"source": [
127+
"df.nunique()"
128+
]
129+
},
130+
{
131+
"cell_type": "code",
132+
"execution_count": 11,
133+
"metadata": {},
134+
"outputs": [],
135+
"source": [
136+
"X = df.drop(\"year_2019\", axis=1)\n",
137+
"y = df[\"year_2019\"]"
138+
]
139+
},
140+
{
141+
"cell_type": "code",
142+
"execution_count": 12,
143+
"metadata": {},
144+
"outputs": [
145+
{
146+
"name": "stdout",
147+
"output_type": "stream",
148+
"text": [
149+
"[0, 1, 2, 3, 4]\n"
150+
]
151+
}
152+
],
153+
"source": [
154+
"cat_features = list(range(0, X.shape[1]))\n",
155+
"print(cat_features)"
156+
]
157+
},
158+
{
159+
"cell_type": "code",
160+
"execution_count": 13,
161+
"metadata": {},
162+
"outputs": [],
163+
"source": [
164+
"from sklearn.model_selection import train_test_split\n",
165+
"\n",
166+
"\n",
167+
"X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=0)"
168+
]
169+
},
170+
{
171+
"cell_type": "code",
172+
"execution_count": 14,
173+
"metadata": {},
174+
"outputs": [
175+
{
176+
"name": "stdout",
177+
"output_type": "stream",
178+
"text": [
179+
"0:\tlearn: 0.9089395\ttest: 0.9088238\tbest: 0.9088238 (0)\ttotal: 83.9ms\tremaining: 755ms\n",
180+
"5:\tlearn: 0.6921539\ttest: 0.6930650\tbest: 0.6930650 (5)\ttotal: 219ms\tremaining: 146ms\n",
181+
"9:\tlearn: 0.6811982\ttest: 0.6819978\tbest: 0.6819978 (9)\ttotal: 319ms\tremaining: 0us\n",
182+
"\n",
183+
"bestTest = 0.6819978385\n",
184+
"bestIteration = 9\n",
185+
"\n"
186+
]
187+
},
188+
{
189+
"data": {
190+
"text/plain": [
191+
"<catboost.core.CatBoostClassifier at 0x7f2c8c533390>"
192+
]
193+
},
194+
"execution_count": 14,
195+
"metadata": {},
196+
"output_type": "execute_result"
197+
}
198+
],
199+
"source": [
200+
"from catboost import CatBoostClassifier\n",
201+
"\n",
202+
"clf = CatBoostClassifier(iterations=10, verbose=5, learning_rate=0.5)\n",
203+
"clf.fit(X_train, y_train, cat_features=cat_features, eval_set=(X_val, y_val))"
204+
]
205+
},
206+
{
207+
"cell_type": "code",
208+
"execution_count": 15,
209+
"metadata": {},
210+
"outputs": [],
211+
"source": [
212+
"y_prob = clf.predict_proba(data=X_val)\n",
213+
"y_pred = clf.predict(data=X_val)"
214+
]
215+
},
216+
{
217+
"cell_type": "code",
218+
"execution_count": 16,
219+
"metadata": {},
220+
"outputs": [
221+
{
222+
"name": "stdout",
223+
"output_type": "stream",
224+
"text": [
225+
"Accuracy: 0.7176913425345044\n"
226+
]
227+
}
228+
],
229+
"source": [
230+
"from sklearn.metrics import accuracy_score\n",
231+
"\n",
232+
"accuracy = accuracy_score(y_val, y_pred)\n",
233+
"print('Accuracy:', accuracy)"
234+
]
235+
},
236+
{
237+
"cell_type": "code",
238+
"execution_count": 17,
239+
"metadata": {},
240+
"outputs": [
241+
{
242+
"name": "stdout",
243+
"output_type": "stream",
244+
"text": [
245+
"AUC: 0.8310445038554471\n"
246+
]
247+
}
248+
],
249+
"source": [
250+
"from sklearn.metrics import roc_auc_score\n",
251+
"\n",
252+
"auc = roc_auc_score(y_val, y_prob, multi_class=\"ovo\", average=\"macro\")\n",
253+
"print('AUC:', auc)"
254+
]
255+
},
256+
{
257+
"cell_type": "code",
258+
"execution_count": 18,
259+
"metadata": {},
260+
"outputs": [
261+
{
262+
"name": "stdout",
263+
"output_type": "stream",
264+
"text": [
265+
" precision recall f1-score support\n",
266+
"\n",
267+
" active 0.75 0.87 0.80 11582\n",
268+
" cancelled 0.63 0.52 0.57 5562\n",
269+
" no-show 0.73 0.52 0.61 3578\n",
270+
"\n",
271+
" accuracy 0.72 20722\n",
272+
" macro avg 0.70 0.64 0.66 20722\n",
273+
"weighted avg 0.71 0.72 0.71 20722\n",
274+
"\n"
275+
]
276+
}
277+
],
278+
"source": [
279+
"from sklearn.metrics import classification_report\n",
280+
"\n",
281+
"print(classification_report(y_val, y_pred))"
282+
]
283+
},
284+
{
285+
"cell_type": "code",
286+
"execution_count": null,
287+
"metadata": {},
288+
"outputs": [],
289+
"source": []
290+
}
291+
],
292+
"metadata": {
293+
"kernelspec": {
294+
"display_name": "Python 3",
295+
"language": "python",
296+
"name": "python3"
297+
},
298+
"language_info": {
299+
"codemirror_mode": {
300+
"name": "ipython",
301+
"version": 3
302+
},
303+
"file_extension": ".py",
304+
"mimetype": "text/x-python",
305+
"name": "python",
306+
"nbconvert_exporter": "python",
307+
"pygments_lexer": "ipython3",
308+
"version": "3.6.7"
309+
}
310+
},
311+
"nbformat": 4,
312+
"nbformat_minor": 2
313+
}

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /