|
614 | 614 | "cell_type": "markdown",
|
615 | 615 | "metadata": {},
|
616 | 616 | "source": [
|
617 | | - "#### Reindexing DataFrames" |
| 617 | + "## Resampling time series data\n", |
| 618 | + "\n", |
| 619 | + "* Statistical methods over different time intervals\n", |
| 620 | + " * mean(), sum(), count(), etc.\n", |
| 621 | + "* Downsampling\n", |
| 622 | + " * Reduce datetime rows to a slower frequency\n", |
| 623 | + " * e.g. Daily -> Weekly\n", |
| 624 | + "* Upsampling\n", |
| 625 | + " * Increase datetime rows to a faster frequency\n", |
| 626 | + " * e.g. Daily -> Hourly" |
618 | 627 | ]
|
619 | 628 | },
|
620 | 629 | {
|
621 | 630 | "cell_type": "code",
|
622 | 631 | "execution_count": 5,
|
623 | 632 | "metadata": {},
|
624 | | - "outputs": [], |
| 633 | + "outputs": [ |
| 634 | + { |
| 635 | + "data": { |
| 636 | + "text/html": [ |
| 637 | + "<div>\n", |
| 638 | + "<style scoped>\n", |
| 639 | + " .dataframe tbody tr th:only-of-type {\n", |
| 640 | + " vertical-align: middle;\n", |
| 641 | + " }\n", |
| 642 | + "\n", |
| 643 | + " .dataframe tbody tr th {\n", |
| 644 | + " vertical-align: top;\n", |
| 645 | + " }\n", |
| 646 | + "\n", |
| 647 | + " .dataframe thead th {\n", |
| 648 | + " text-align: right;\n", |
| 649 | + " }\n", |
| 650 | + "</style>\n", |
| 651 | + "<table border=\"1\" class=\"dataframe\">\n", |
| 652 | + " <thead>\n", |
| 653 | + " <tr style=\"text-align: right;\">\n", |
| 654 | + " <th></th>\n", |
| 655 | + " <th>Flight Number</th>\n", |
| 656 | + " <th>Scheduled Elapsed Time(Minutes)</th>\n", |
| 657 | + " <th>Actual Elapsed Time(Minutes)</th>\n", |
| 658 | + " <th>Departure Delay(Minutes)</th>\n", |
| 659 | + " <th>Taxi-out Time(Minutes)</th>\n", |
| 660 | + " <th>DelayCarrier(Minutes)</th>\n", |
| 661 | + " <th>DelayWeather(Minutes)</th>\n", |
| 662 | + " <th>DelayNational Aviation System(Minutes)</th>\n", |
| 663 | + " <th>DelaySecurity(Minutes)</th>\n", |
| 664 | + " <th>DelayLate Aircraft Arrival(Minutes)</th>\n", |
| 665 | + " <th>Unnamed: 17</th>\n", |
| 666 | + " </tr>\n", |
| 667 | + " <tr>\n", |
| 668 | + " <th>Date (MM/DD/YYYY)</th>\n", |
| 669 | + " <th></th>\n", |
| 670 | + " <th></th>\n", |
| 671 | + " <th></th>\n", |
| 672 | + " <th></th>\n", |
| 673 | + " <th></th>\n", |
| 674 | + " <th></th>\n", |
| 675 | + " <th></th>\n", |
| 676 | + " <th></th>\n", |
| 677 | + " <th></th>\n", |
| 678 | + " <th></th>\n", |
| 679 | + " <th></th>\n", |
| 680 | + " </tr>\n", |
| 681 | + " </thead>\n", |
| 682 | + " <tbody>\n", |
| 683 | + " <tr>\n", |
| 684 | + " <th>2015-07-05</th>\n", |
| 685 | + " <td>1908.610487</td>\n", |
| 686 | + " <td>130.337079</td>\n", |
| 687 | + " <td>124.003745</td>\n", |
| 688 | + " <td>10.164794</td>\n", |
| 689 | + " <td>10.041199</td>\n", |
| 690 | + " <td>2.370787</td>\n", |
| 691 | + " <td>0.198502</td>\n", |
| 692 | + " <td>0.902622</td>\n", |
| 693 | + " <td>0.033708</td>\n", |
| 694 | + " <td>5.243446</td>\n", |
| 695 | + " <td>NaN</td>\n", |
| 696 | + " </tr>\n", |
| 697 | + " <tr>\n", |
| 698 | + " <th>2015-07-12</th>\n", |
| 699 | + " <td>1930.015228</td>\n", |
| 700 | + " <td>129.809645</td>\n", |
| 701 | + " <td>126.302030</td>\n", |
| 702 | + " <td>7.370558</td>\n", |
| 703 | + " <td>10.098985</td>\n", |
| 704 | + " <td>1.680203</td>\n", |
| 705 | + " <td>0.101523</td>\n", |
| 706 | + " <td>1.192893</td>\n", |
| 707 | + " <td>0.015228</td>\n", |
| 708 | + " <td>3.934010</td>\n", |
| 709 | + " <td>NaN</td>\n", |
| 710 | + " </tr>\n", |
| 711 | + " <tr>\n", |
| 712 | + " <th>2015-07-19</th>\n", |
| 713 | + " <td>1930.015228</td>\n", |
| 714 | + " <td>129.809645</td>\n", |
| 715 | + " <td>125.236041</td>\n", |
| 716 | + " <td>14.441624</td>\n", |
| 717 | + " <td>9.807107</td>\n", |
| 718 | + " <td>3.060914</td>\n", |
| 719 | + " <td>0.979695</td>\n", |
| 720 | + " <td>1.167513</td>\n", |
| 721 | + " <td>0.000000</td>\n", |
| 722 | + " <td>6.913706</td>\n", |
| 723 | + " <td>NaN</td>\n", |
| 724 | + " </tr>\n", |
| 725 | + " <tr>\n", |
| 726 | + " <th>2015-07-26</th>\n", |
| 727 | + " <td>1930.015228</td>\n", |
| 728 | + " <td>129.809645</td>\n", |
| 729 | + " <td>123.413706</td>\n", |
| 730 | + " <td>10.418782</td>\n", |
| 731 | + " <td>9.804569</td>\n", |
| 732 | + " <td>5.395939</td>\n", |
| 733 | + " <td>0.000000</td>\n", |
| 734 | + " <td>1.124365</td>\n", |
| 735 | + " <td>0.000000</td>\n", |
| 736 | + " <td>2.621827</td>\n", |
| 737 | + " <td>NaN</td>\n", |
| 738 | + " </tr>\n", |
| 739 | + " <tr>\n", |
| 740 | + " <th>2015-08-02</th>\n", |
| 741 | + " <td>1779.835052</td>\n", |
| 742 | + " <td>128.453608</td>\n", |
| 743 | + " <td>122.257732</td>\n", |
| 744 | + " <td>8.206186</td>\n", |
| 745 | + " <td>9.979381</td>\n", |
| 746 | + " <td>2.154639</td>\n", |
| 747 | + " <td>0.000000</td>\n", |
| 748 | + " <td>1.048110</td>\n", |
| 749 | + " <td>0.000000</td>\n", |
| 750 | + " <td>3.934708</td>\n", |
| 751 | + " <td>NaN</td>\n", |
| 752 | + " </tr>\n", |
| 753 | + " </tbody>\n", |
| 754 | + "</table>\n", |
| 755 | + "</div>" |
| 756 | + ], |
| 757 | + "text/plain": [ |
| 758 | + " Flight Number Scheduled Elapsed Time(Minutes) \\\n", |
| 759 | + "Date (MM/DD/YYYY) \n", |
| 760 | + "2015-07-05 1908.610487 130.337079 \n", |
| 761 | + "2015-07-12 1930.015228 129.809645 \n", |
| 762 | + "2015-07-19 1930.015228 129.809645 \n", |
| 763 | + "2015-07-26 1930.015228 129.809645 \n", |
| 764 | + "2015-08-02 1779.835052 128.453608 \n", |
| 765 | + "\n", |
| 766 | + " Actual Elapsed Time(Minutes) Departure Delay(Minutes) \\\n", |
| 767 | + "Date (MM/DD/YYYY) \n", |
| 768 | + "2015-07-05 124.003745 10.164794 \n", |
| 769 | + "2015-07-12 126.302030 7.370558 \n", |
| 770 | + "2015-07-19 125.236041 14.441624 \n", |
| 771 | + "2015-07-26 123.413706 10.418782 \n", |
| 772 | + "2015-08-02 122.257732 8.206186 \n", |
| 773 | + "\n", |
| 774 | + " Taxi-out Time(Minutes) DelayCarrier(Minutes) \\\n", |
| 775 | + "Date (MM/DD/YYYY) \n", |
| 776 | + "2015-07-05 10.041199 2.370787 \n", |
| 777 | + "2015-07-12 10.098985 1.680203 \n", |
| 778 | + "2015-07-19 9.807107 3.060914 \n", |
| 779 | + "2015-07-26 9.804569 5.395939 \n", |
| 780 | + "2015-08-02 9.979381 2.154639 \n", |
| 781 | + "\n", |
| 782 | + " DelayWeather(Minutes) \\\n", |
| 783 | + "Date (MM/DD/YYYY) \n", |
| 784 | + "2015-07-05 0.198502 \n", |
| 785 | + "2015-07-12 0.101523 \n", |
| 786 | + "2015-07-19 0.979695 \n", |
| 787 | + "2015-07-26 0.000000 \n", |
| 788 | + "2015-08-02 0.000000 \n", |
| 789 | + "\n", |
| 790 | + " DelayNational Aviation System(Minutes) \\\n", |
| 791 | + "Date (MM/DD/YYYY) \n", |
| 792 | + "2015-07-05 0.902622 \n", |
| 793 | + "2015-07-12 1.192893 \n", |
| 794 | + "2015-07-19 1.167513 \n", |
| 795 | + "2015-07-26 1.124365 \n", |
| 796 | + "2015-08-02 1.048110 \n", |
| 797 | + "\n", |
| 798 | + " DelaySecurity(Minutes) \\\n", |
| 799 | + "Date (MM/DD/YYYY) \n", |
| 800 | + "2015-07-05 0.033708 \n", |
| 801 | + "2015-07-12 0.015228 \n", |
| 802 | + "2015-07-19 0.000000 \n", |
| 803 | + "2015-07-26 0.000000 \n", |
| 804 | + "2015-08-02 0.000000 \n", |
| 805 | + "\n", |
| 806 | + " DelayLate Aircraft Arrival(Minutes) Unnamed: 17 \n", |
| 807 | + "Date (MM/DD/YYYY) \n", |
| 808 | + "2015-07-05 5.243446 NaN \n", |
| 809 | + "2015-07-12 3.934010 NaN \n", |
| 810 | + "2015-07-19 6.913706 NaN \n", |
| 811 | + "2015-07-26 2.621827 NaN \n", |
| 812 | + "2015-08-02 3.934708 NaN " |
| 813 | + ] |
| 814 | + }, |
| 815 | + "execution_count": 5, |
| 816 | + "metadata": {}, |
| 817 | + "output_type": "execute_result" |
| 818 | + } |
| 819 | + ], |
625 | 820 | "source": [
|
626 | | - "# df.reindex(july_11_2015)" |
| 821 | + "# Downsampling\n", |
| 822 | + "weekly_mean = df.resample('W').mean()\n", |
| 823 | + "weekly_mean" |
627 | 824 | ]
|
628 | 825 | },
|
629 | 826 | {
|
630 | 827 | "cell_type": "markdown",
|
631 | 828 | "metadata": {},
|
632 | 829 | "source": [
|
633 | | - "### Filling missing values" |
| 830 | + "## Manipulating data\n", |
| 831 | + "\n", |
| 832 | + "* String methods\n", |
| 833 | + " * Substring matching" |
634 | 834 | ]
|
635 | 835 | },
|
636 | 836 | {
|
637 | 837 | "cell_type": "code",
|
638 | 838 | "execution_count": 6,
|
639 | 839 | "metadata": {},
|
640 | | - "outputs": [], |
| 840 | + "outputs": [ |
| 841 | + { |
| 842 | + "name": "stdout", |
| 843 | + "output_type": "stream", |
| 844 | + "text": [ |
| 845 | + "<class 'pandas.core.frame.DataFrame'>\n", |
| 846 | + "RangeIndex: 13374 entries, 0 to 13373\n", |
| 847 | + "Data columns (total 5 columns):\n", |
| 848 | + "CountryName 13374 non-null object\n", |
| 849 | + "CountryCode 13374 non-null object\n", |
| 850 | + "Year 13374 non-null int64\n", |
| 851 | + "Total Population 13374 non-null float64\n", |
| 852 | + "Urban population (% of total) 13374 non-null float64\n", |
| 853 | + "dtypes: float64(2), int64(1), object(2)\n", |
| 854 | + "memory usage: 522.5+ KB\n" |
| 855 | + ] |
| 856 | + }, |
| 857 | + { |
| 858 | + "data": { |
| 859 | + "text/html": [ |
| 860 | + "<div>\n", |
| 861 | + "<style scoped>\n", |
| 862 | + " .dataframe tbody tr th:only-of-type {\n", |
| 863 | + " vertical-align: middle;\n", |
| 864 | + " }\n", |
| 865 | + "\n", |
| 866 | + " .dataframe tbody tr th {\n", |
| 867 | + " vertical-align: top;\n", |
| 868 | + " }\n", |
| 869 | + "\n", |
| 870 | + " .dataframe thead th {\n", |
| 871 | + " text-align: right;\n", |
| 872 | + " }\n", |
| 873 | + "</style>\n", |
| 874 | + "<table border=\"1\" class=\"dataframe\">\n", |
| 875 | + " <thead>\n", |
| 876 | + " <tr style=\"text-align: right;\">\n", |
| 877 | + " <th></th>\n", |
| 878 | + " <th>CountryName</th>\n", |
| 879 | + " <th>CountryCode</th>\n", |
| 880 | + " <th>Year</th>\n", |
| 881 | + " <th>Total Population</th>\n", |
| 882 | + " <th>Urban population (% of total)</th>\n", |
| 883 | + " </tr>\n", |
| 884 | + " </thead>\n", |
| 885 | + " <tbody>\n", |
| 886 | + " <tr>\n", |
| 887 | + " <th>0</th>\n", |
| 888 | + " <td>Arab World</td>\n", |
| 889 | + " <td>ARB</td>\n", |
| 890 | + " <td>1960</td>\n", |
| 891 | + " <td>9.249590e+07</td>\n", |
| 892 | + " <td>31.285384</td>\n", |
| 893 | + " </tr>\n", |
| 894 | + " <tr>\n", |
| 895 | + " <th>1</th>\n", |
| 896 | + " <td>Caribbean small states</td>\n", |
| 897 | + " <td>CSS</td>\n", |
| 898 | + " <td>1960</td>\n", |
| 899 | + " <td>4.190810e+06</td>\n", |
| 900 | + " <td>31.597490</td>\n", |
| 901 | + " </tr>\n", |
| 902 | + " <tr>\n", |
| 903 | + " <th>2</th>\n", |
| 904 | + " <td>Central Europe and the Baltics</td>\n", |
| 905 | + " <td>CEB</td>\n", |
| 906 | + " <td>1960</td>\n", |
| 907 | + " <td>9.140158e+07</td>\n", |
| 908 | + " <td>44.507921</td>\n", |
| 909 | + " </tr>\n", |
| 910 | + " <tr>\n", |
| 911 | + " <th>3</th>\n", |
| 912 | + " <td>East Asia &amp; Pacific (all income levels)</td>\n", |
| 913 | + " <td>EAS</td>\n", |
| 914 | + " <td>1960</td>\n", |
| 915 | + " <td>1.042475e+09</td>\n", |
| 916 | + " <td>22.471132</td>\n", |
| 917 | + " </tr>\n", |
| 918 | + " <tr>\n", |
| 919 | + " <th>4</th>\n", |
| 920 | + " <td>East Asia &amp; Pacific (developing only)</td>\n", |
| 921 | + " <td>EAP</td>\n", |
| 922 | + " <td>1960</td>\n", |
| 923 | + " <td>8.964930e+08</td>\n", |
| 924 | + " <td>16.917679</td>\n", |
| 925 | + " </tr>\n", |
| 926 | + " </tbody>\n", |
| 927 | + "</table>\n", |
| 928 | + "</div>" |
| 929 | + ], |
| 930 | + "text/plain": [ |
| 931 | + " CountryName CountryCode Year \\\n", |
| 932 | + "0 Arab World ARB 1960 \n", |
| 933 | + "1 Caribbean small states CSS 1960 \n", |
| 934 | + "2 Central Europe and the Baltics CEB 1960 \n", |
| 935 | + "3 East Asia & Pacific (all income levels) EAS 1960 \n", |
| 936 | + "4 East Asia & Pacific (developing only) EAP 1960 \n", |
| 937 | + "\n", |
| 938 | + " Total Population Urban population (% of total) \n", |
| 939 | + "0 9.249590e+07 31.285384 \n", |
| 940 | + "1 4.190810e+06 31.597490 \n", |
| 941 | + "2 9.140158e+07 44.507921 \n", |
| 942 | + "3 1.042475e+09 22.471132 \n", |
| 943 | + "4 8.964930e+08 16.917679 " |
| 944 | + ] |
| 945 | + }, |
| 946 | + "execution_count": 6, |
| 947 | + "metadata": {}, |
| 948 | + "output_type": "execute_result" |
| 949 | + } |
| 950 | + ], |
| 951 | + "source": [ |
| 952 | + "df = pd.read_csv('https://assets.datacamp.com/production/repositories/497/datasets/2175fef4b3691db03449bbc7ddffb740319c1131/world_ind_pop_data.csv')\n", |
| 953 | + "df.info()\n", |
| 954 | + "df.head()" |
| 955 | + ] |
| 956 | + }, |
| 957 | + { |
| 958 | + "cell_type": "code", |
| 959 | + "execution_count": 7, |
| 960 | + "metadata": {}, |
| 961 | + "outputs": [ |
| 962 | + { |
| 963 | + "name": "stdout", |
| 964 | + "output_type": "stream", |
| 965 | + "text": [ |
| 966 | + "<class 'pandas.core.frame.DataFrame'>\n", |
| 967 | + "Int64Index: 220 entries, 20 to 13303\n", |
| 968 | + "Data columns (total 5 columns):\n", |
| 969 | + "CountryName 220 non-null object\n", |
| 970 | + "CountryCode 220 non-null object\n", |
| 971 | + "Year 220 non-null int64\n", |
| 972 | + "Total Population 220 non-null float64\n", |
| 973 | + "Urban population (% of total) 220 non-null float64\n", |
| 974 | + "dtypes: float64(2), int64(1), object(2)\n", |
| 975 | + "memory usage: 10.3+ KB\n" |
| 976 | + ] |
| 977 | + } |
| 978 | + ], |
| 979 | + "source": [ |
| 980 | + "# Using .info() to find that there are 220 matching entries\n", |
| 981 | + "df[df['CountryName'].str.contains('North')].info()" |
| 982 | + ] |
| 983 | + }, |
| 984 | + { |
| 985 | + "cell_type": "code", |
| 986 | + "execution_count": 8, |
| 987 | + "metadata": {}, |
| 988 | + "outputs": [ |
| 989 | + { |
| 990 | + "data": { |
| 991 | + "text/plain": [ |
| 992 | + "220" |
| 993 | + ] |
| 994 | + }, |
| 995 | + "execution_count": 8, |
| 996 | + "metadata": {}, |
| 997 | + "output_type": "execute_result" |
| 998 | + } |
| 999 | + ], |
641 | 1000 | "source": [
|
642 | | - "# ffill = forward fill, bfill = back fill\n", |
643 | | - "# df.reindex(july_11_2015, method='ffill')" |
| 1001 | + "# Using .sum() to infer there are 220 matching entries\n", |
| 1002 | + "df['CountryName'].str.contains('North').sum()" |
644 | 1003 | ]
|
645 | 1004 | },
|
646 | 1005 | {
|
|
0 commit comments