diff --git a/pandas/pandas.ipynb b/pandas/pandas.ipynb index ee247f8..6c8c4fd 100644 --- a/pandas/pandas.ipynb +++ b/pandas/pandas.ipynb @@ -1,7 +1,7 @@ { "metadata": { "name": "", - "signature": "sha256:17491453ae73630f23f856b3c0724fbd00a52c4f53b239a07e340ec3113ea230" + "signature": "sha256:2b6aa402b58aa2da8c06d378f19732970903abe573b39f9a7490982d0e2ebcbc" }, "nbformat": 3, "nbformat_minor": 0, @@ -3331,6 +3331,13 @@ "## Arithmetic and Data Alignment" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Adding Series objects results in the union of index pairs if the pairs are not the same, resulting in NaN for indices that do not overlap:" + ] + }, { "cell_type": "code", "collapsed": false, @@ -3338,7 +3345,7 @@ "np.random.seed(0)\n", "ser_6 = Series(np.random.randn(5),\n", " index=['a', 'b', 'c', 'd', 'e'])\n", - "ser_6\n" + "ser_6" ], "language": "python", "metadata": {}, @@ -3387,13 +3394,6 @@ ], "prompt_number": 65 }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Adding objects results in the union of index pairs if the pairs are not the same, resulting in NaN for indices that do not overlap:" - ] - }, { "cell_type": "code", "collapsed": false, @@ -3455,6 +3455,13 @@ ], "prompt_number": 67 }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Adding DataFrame objects results in the union of index pairs for rows and columns if the pairs are not the same, resulting in NaN for indices that do not overlap:" + ] + }, { "cell_type": "code", "collapsed": false, @@ -3575,13 +3582,6 @@ ], "prompt_number": 69 }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Adding objects results in the union of index pairs for rows and columns if the pairs are not the same, resulting in NaN for indices that do not overlap:" - ] - }, { "cell_type": "code", "collapsed": false, @@ -3784,6 +3784,13 @@ ], "prompt_number": 72 }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Match the index of the Series on the DataFrame's columns, broadcasting down the rows and union the indices that do not match:" + ] + }, { "cell_type": "code", "collapsed": false, @@ -3808,13 +3815,6 @@ ], "prompt_number": 73 }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Match the index of the Series on the DataFrame's columns, broadcasting down the rows and union the indices that do not match:" - ] - }, { "cell_type": "code", "collapsed": false, @@ -3880,6 +3880,13 @@ ], "prompt_number": 74 }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Broadcast over the columns and match the rows (axis=0) by using an arithmetic method:" + ] + }, { "cell_type": "code", "collapsed": false, @@ -3946,37 +3953,30 @@ "collapsed": false, "input": [ "ser_10 = Series([100, 200, 300])\n", - "print ser_10" + "ser_10" ], "language": "python", "metadata": {}, "outputs": [ { - "output_type": "stream", - "stream": "stdout", + "metadata": {}, + "output_type": "pyout", + "prompt_number": 76, "text": [ "0 100\n", "1 200\n", "2 300\n", - "dtype: int64\n" + "dtype: int64" ] } ], "prompt_number": 76 }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Broadcast over the columns and match the rows (axis=0) by using an arithmetic method:" - ] - }, { "cell_type": "code", "collapsed": false, "input": [ - "df_11 = df_10.sub(ser_10, axis=0)\n", - "df_11" + "df_10.sub(ser_10, axis=0)" ], "language": "python", "metadata": {}, @@ -4051,8 +4051,8 @@ "cell_type": "code", "collapsed": false, "input": [ - "df_12 = np.abs(df_11)\n", - "df_12" + "df_11 = np.abs(df_11)\n", + "df_11" ], "language": "python", "metadata": {}, @@ -4073,24 +4073,24 @@ " \n", " \n", " 0\n", - " 99.451186\n", - " 98.867789\n", - " 98.676912\n", - " 99.999886\n", + " 0.000000\n", + " 0.000000\n", + " 0.000000\n", + " 0.000000\n", " \n", " \n", " 1\n", - " 199.455117\n", - " 199.274013\n", - " 199.207350\n", - " 199.907661\n", + " 0.003930\n", + " 0.406224\n", + " 0.530438\n", + " 0.092224\n", " \n", " \n", " 2\n", - " 299.562413\n", - " 298.921967\n", - " 298.690777\n", - " 299.603233\n", + " 0.111226\n", + " 0.054178\n", + " 0.013864\n", + " 0.396653\n", " \n", " \n", "\n", @@ -4100,10 +4100,10 @@ "output_type": "pyout", "prompt_number": 78, "text": [ - " a b c d\n", - "0 99.451186 98.867789 98.676912 99.999886\n", - "1 199.455117 199.274013 199.207350 199.907661\n", - "2 299.562413 298.921967 298.690777 299.603233" + " a b c d\n", + "0 0.000000 0.000000 0.000000 0.000000\n", + "1 0.003930 0.406224 0.530438 0.092224\n", + "2 0.111226 0.054178 0.013864 0.396653" ] } ], @@ -4121,7 +4121,7 @@ "collapsed": false, "input": [ "func_1 = lambda x: x.max() - x.min()\n", - "df_12.apply(func_1)" + "df_11.apply(func_1)" ], "language": "python", "metadata": {}, @@ -4131,10 +4131,10 @@ "output_type": "pyout", "prompt_number": 79, "text": [ - "a 200.111226\n", - "b 200.054178\n", - "c 200.013864\n", - "d 199.603347\n", + "a 0.111226\n", + "b 0.406224\n", + "c 0.530438\n", + "d 0.396653\n", "dtype: float64" ] } @@ -4152,7 +4152,7 @@ "cell_type": "code", "collapsed": false, "input": [ - "df_12.apply(func_1, axis=1)" + "df_11.apply(func_1, axis=1)" ], "language": "python", "metadata": {}, @@ -4162,9 +4162,9 @@ "output_type": "pyout", "prompt_number": 80, "text": [ - "0 1.322973\n", - "1 0.700311\n", - "2 0.912456\n", + "0 0.000000\n", + "1 0.526508\n", + "2 0.382789\n", "dtype: float64" ] } @@ -4183,7 +4183,7 @@ "collapsed": false, "input": [ "func_2 = lambda x: Series([x.min(), x.max()], index=['min', 'max'])\n", - "df_12.apply(func_2)" + "df_11.apply(func_2)" ], "language": "python", "metadata": {}, @@ -4204,17 +4204,17 @@ " \n", " \n", " min\n", - " 99.451186\n", - " 98.867789\n", - " 98.676912\n", - " 99.999886\n", + " 0.000000\n", + " 0.000000\n", + " 0.000000\n", + " 0.000000\n", " \n", " \n", " max\n", - " 299.562413\n", - " 298.921967\n", - " 298.690777\n", - " 299.603233\n", + " 0.111226\n", + " 0.406224\n", + " 0.530438\n", + " 0.396653\n", " \n", " \n", "\n", @@ -4224,9 +4224,9 @@ "output_type": "pyout", "prompt_number": 81, "text": [ - " a b c d\n", - "min 99.451186 98.867789 98.676912 99.999886\n", - "max 299.562413 298.921967 298.690777 299.603233" + " a b c d\n", + "min 0.000000 0.000000 0.000000 0.000000\n", + "max 0.111226 0.406224 0.530438 0.396653" ] } ], @@ -4244,7 +4244,7 @@ "collapsed": false, "input": [ "func_3 = lambda x: '%.2f' %x\n", - "df_12.applymap(func_3)" + "df_11.applymap(func_3)" ], "language": "python", "metadata": {}, @@ -4265,24 +4265,24 @@ " \n", " \n", " 0\n", - " 99.45\n", - " 98.87\n", - " 98.68\n", - " 100.00\n", + " 0.00\n", + " 0.00\n", + " 0.00\n", + " 0.00\n", " \n", " \n", " 1\n", - " 199.46\n", - " 199.27\n", - " 199.21\n", - " 199.91\n", + " 0.00\n", + " 0.41\n", + " 0.53\n", + " 0.09\n", " \n", " \n", " 2\n", - " 299.56\n", - " 298.92\n", - " 298.69\n", - " 299.60\n", + " 0.11\n", + " 0.05\n", + " 0.01\n", + " 0.40\n", " \n", " \n", "\n", @@ -4292,10 +4292,10 @@ "output_type": "pyout", "prompt_number": 82, "text": [ - " a b c d\n", - "0 99.45 98.87 98.68 100.00\n", - "1 199.46 199.27 199.21 199.91\n", - "2 299.56 298.92 298.69 299.60" + " a b c d\n", + "0 0.00 0.00 0.00 0.00\n", + "1 0.00 0.41 0.53 0.09\n", + "2 0.11 0.05 0.01 0.40" ] } ], @@ -4312,7 +4312,7 @@ "cell_type": "code", "collapsed": false, "input": [ - "df_12['a'].map(func_3)" + "df_11['a'].map(func_3)" ], "language": "python", "metadata": {}, @@ -4322,9 +4322,9 @@ "output_type": "pyout", "prompt_number": 83, "text": [ - "0 99.45\n", - "1 199.46\n", - "2 299.56\n", + "0 0.00\n", + "1 0.00\n", + "2 0.11\n", "Name: a, dtype: object" ] } @@ -4350,7 +4350,7 @@ { "metadata": {}, "output_type": "pyout", - "prompt_number": 114, + "prompt_number": 84, "text": [ "fo 100\n", "br 200\n", @@ -4360,7 +4360,7 @@ ] } ], - "prompt_number": 114 + "prompt_number": 84 }, { "cell_type": "markdown", @@ -4381,7 +4381,7 @@ { "metadata": {}, "output_type": "pyout", - "prompt_number": 115, + "prompt_number": 85, "text": [ "br 200\n", "bz 300\n", @@ -4391,7 +4391,7 @@ ] } ], - "prompt_number": 115 + "prompt_number": 85 }, { "cell_type": "markdown", @@ -4412,7 +4412,7 @@ { "metadata": {}, "output_type": "pyout", - "prompt_number": 132, + "prompt_number": 86, "text": [ "fo 100\n", "br 200\n", @@ -4422,16 +4422,16 @@ ] } ], - "prompt_number": 132 + "prompt_number": 86 }, { "cell_type": "code", "collapsed": false, "input": [ - "df_13 = DataFrame(np.arange(12).reshape((3, 4)),\n", + "df_12 = DataFrame(np.arange(12).reshape((3, 4)),\n", " index=['three', 'one', 'two'],\n", " columns=['c', 'a', 'b', 'd'])\n", - "df_13" + "df_12" ], "language": "python", "metadata": {}, @@ -4477,7 +4477,7 @@ ], "metadata": {}, "output_type": "pyout", - "prompt_number": 128, + "prompt_number": 87, "text": [ " c a b d\n", "three 0 1 2 3\n", @@ -4486,7 +4486,7 @@ ] } ], - "prompt_number": 128 + "prompt_number": 87 }, { "cell_type": "markdown", @@ -4499,7 +4499,7 @@ "cell_type": "code", "collapsed": false, "input": [ - "df_13.sort_index()" + "df_12.sort_index()" ], "language": "python", "metadata": {}, @@ -4545,7 +4545,7 @@ ], "metadata": {}, "output_type": "pyout", - "prompt_number": 129, + "prompt_number": 88, "text": [ " c a b d\n", "one 4 5 6 7\n", @@ -4554,7 +4554,7 @@ ] } ], - "prompt_number": 129 + "prompt_number": 88 }, { "cell_type": "markdown", @@ -4567,7 +4567,7 @@ "cell_type": "code", "collapsed": false, "input": [ - "df_13.sort_index(axis=1, ascending=False)" + "df_12.sort_index(axis=1, ascending=False)" ], "language": "python", "metadata": {}, @@ -4613,7 +4613,7 @@ ], "metadata": {}, "output_type": "pyout", - "prompt_number": 131, + "prompt_number": 89, "text": [ " d c b a\n", "three 3 0 2 1\n", @@ -4622,7 +4622,7 @@ ] } ], - "prompt_number": 131 + "prompt_number": 89 }, { "cell_type": "markdown", @@ -4635,7 +4635,7 @@ "cell_type": "code", "collapsed": false, "input": [ - "df_13.sort_index(by=['d', 'c'])" + "df_12.sort_index(by=['d', 'c'])" ], "language": "python", "metadata": {}, @@ -4681,7 +4681,7 @@ ], "metadata": {}, "output_type": "pyout", - "prompt_number": 134, + "prompt_number": 90, "text": [ " c a b d\n", "three 0 1 2 3\n", @@ -4690,7 +4690,142 @@ ] } ], - "prompt_number": 134 + "prompt_number": 90 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Ranking is similar to numpy.argsort except that ties are broken by assigning each group the mean rank:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "ser_11 = Series([7, -5, 7, 4, 2, 0, 4, 7])\n", + "ser_11 = ser_11.order()\n", + "ser_11" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 91, + "text": [ + "1 -5\n", + "5 0\n", + "4 2\n", + "3 4\n", + "6 4\n", + "0 7\n", + "2 7\n", + "7 7\n", + "dtype: int64" + ] + } + ], + "prompt_number": 91 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "ser_11.rank()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 92, + "text": [ + "1 1.0\n", + "5 2.0\n", + "4 3.0\n", + "3 4.5\n", + "6 4.5\n", + "0 7.0\n", + "2 7.0\n", + "7 7.0\n", + "dtype: float64" + ] + } + ], + "prompt_number": 92 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Rank a Series according to when they appear in the data:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "ser_11.rank(method='first')" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 93, + "text": [ + "1 1\n", + "5 2\n", + "4 3\n", + "3 4\n", + "6 5\n", + "0 6\n", + "2 7\n", + "7 8\n", + "dtype: float64" + ] + } + ], + "prompt_number": 93 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Rank a Series in descending order, using the maximum rank for the group:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "ser_11.rank(ascending=False, method='max')" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 94, + "text": [ + "1 8\n", + "5 7\n", + "4 6\n", + "3 5\n", + "6 5\n", + "0 3\n", + "2 3\n", + "7 3\n", + "dtype: float64" + ] + } + ], + "prompt_number": 94 } ], "metadata": {}