Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit a5a64d6

Browse files
committed
Add Using functions to clean data notes
1 parent 6d83888 commit a5a64d6

File tree

1 file changed

+86
-9
lines changed

1 file changed

+86
-9
lines changed

‎Cleaning_Data_in_Python/Cleaning_Data_for_Analysis.ipynb

Lines changed: 86 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -188,21 +188,98 @@
188188
"metadata": {},
189189
"outputs": [
190190
{
191-
"ename": "TypeError",
192-
"evalue": "findall() missing 1 required positional argument: 'string'",
193-
"output_type": "error",
194-
"traceback": [
195-
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
196-
"\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
197-
"\u001b[1;32m<ipython-input-5-d636d1f14eb0>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m# Find the numeric values in a string\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mmatches\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mre\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfindall\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'\\d*'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
198-
"\u001b[1;31mTypeError\u001b[0m: findall() missing 1 required positional argument: 'string'"
191+
"name": "stdout",
192+
"output_type": "stream",
193+
"text": [
194+
"['10', '3']\n"
199195
]
200196
}
201197
],
202198
"source": [
203199
"# Find the numeric values in a string\n",
204-
"matches = re.findall('\\d*')"
200+
"matches = re.findall('\\d+', 'The Recipe calls for 10 apples and 3 bananas') \n",
201+
"print(matches)"
202+
]
203+
},
204+
{
205+
"cell_type": "code",
206+
"execution_count": 6,
207+
"metadata": {},
208+
"outputs": [
209+
{
210+
"name": "stdout",
211+
"output_type": "stream",
212+
"text": [
213+
"American\n"
214+
]
215+
}
216+
],
217+
"source": [
218+
"# Match a capital letter followed by any number of word characters (letters, digits, underscore) at the start of the string\n",
219+
"result = re.match(pattern='[A-Z]\\w*', string='American Eagle 1977')\n",
220+
"print(result.group())"
221+
]
222+
},
223+
{
224+
"cell_type": "markdown",
225+
"metadata": {},
226+
"source": [
227+
"## Using Functions to Clean Data\n",
228+
"\n",
229+
"**Complex Cleaning**\n",
230+
"* Some cleaning tasks require multiple steps\n",
231+
" * e.g., extract a number from a string\n",
232+
" * Perform a transformation on the extracted number\n",
233+
"* Python functions can be used to combine these steps\n",
234+
"\n",
235+
"**Example 1**"
205236
]
237+
},
238+
{
239+
"cell_type": "code",
240+
"execution_count": 7,
241+
"metadata": {},
242+
"outputs": [],
243+
"source": [
244+
"def calculate_tip_percentage_of_total_bill(row):\n",
245+
" tip_amount = row['tip']\n",
246+
" total_bill = row['total_bill']\n",
247+
" tip_percentage_of_total_bill = (tip_amount / total_bill) * 100\n",
248+
" return tip_percentage_of_total_bill"
249+
]
250+
},
251+
{
252+
"cell_type": "code",
253+
"execution_count": 8,
254+
"metadata": {},
255+
"outputs": [
256+
{
257+
"name": "stdout",
258+
"output_type": "stream",
259+
"text": [
260+
" total_bill tip sex smoker day time size tip_percentage\n",
261+
"0 16.99 1.01 Female True Sun Dinner 2 5.944673\n",
262+
"1 10.34 1.66 Male True Sun Dinner 3 16.054159\n",
263+
"2 21.01 3.50 Male True Sun Dinner 3 16.658734\n",
264+
"3 23.68 3.31 Male True Sun Dinner 2 13.978041\n",
265+
"4 24.59 3.61 Female True Sun Dinner 4 14.680765\n"
266+
]
267+
}
268+
],
269+
"source": [
270+
"# Note: By default, .apply() uses 'axis=0' and works column-wise;\n",
271+
"# 'axis=1' specifies running it row-wise\n",
272+
"df['tip_percentage'] = df.apply(calculate_tip_percentage_of_total_bill,\n",
273+
" axis=1)\n",
274+
"print(df.head())"
275+
]
276+
},
277+
{
278+
"cell_type": "code",
279+
"execution_count": null,
280+
"metadata": {},
281+
"outputs": [],
282+
"source": []
206283
}
207284
],
208285
"metadata": {

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /