From 23e62231d2c44cdb50164ab3a106201de14ebe4d Mon Sep 17 00:00:00 2001 From: Donne Martin Date: Sun, 8 Feb 2015 16:34:16 -0500 Subject: [PATCH] Added snippets for Summarizing and Computing Descriptive Statistics. --- pandas/pandas.ipynb | 237 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 235 insertions(+), 2 deletions(-) diff --git a/pandas/pandas.ipynb b/pandas/pandas.ipynb index 46f32ac..c0c9169 100644 --- a/pandas/pandas.ipynb +++ b/pandas/pandas.ipynb @@ -1,7 +1,7 @@ { "metadata": { "name": "", - "signature": "sha256:a0f056a23acdf795f7de8660469dcb0323a659e710786e7d33a158a6b72c6c03" + "signature": "sha256:364642bdaf4f304a679c7bbde135fea5a7e31eb8da19fb9b24192091e3e34cb0" }, "nbformat": 3, "nbformat_minor": 0, @@ -22,7 +22,8 @@ "* Arithmetic and Data Alignment\n", "* Function Application and Mapping\n", "* Sorting and Ranking\n", - "* Axis Indices with Duplicate Values" + "* Axis Indices with Duplicate Values\n", + "* Summarizing and Computing Descriptive Statistics" ] }, { @@ -5354,6 +5355,238 @@ } ], "prompt_number": 102 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Summarizing and Computing Descriptive Statistics" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Unlike NumPy arrays, Pandas descriptive statistics automatically exclude missing data. NaN values are excluded unless the entire row or column is NA." + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "df_6" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
statepopunemplyear
0 VA 5.0 NaN 2012
1 VA 5.1 NaN 2013
2 VA 5.2 6.0 2014
3 MD 4.0 6.0 2014
4 MD 4.1 6.1 2015
5 NaN NaN NaN NaN
6 NaN NaN NaN NaN
\n", + "
" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 120, + "text": [ + " state pop unempl year\n", + "0 VA 5.0 NaN 2012\n", + "1 VA 5.1 NaN 2013\n", + "2 VA 5.2 6.0 2014\n", + "3 MD 4.0 6.0 2014\n", + "4 MD 4.1 6.1 2015\n", + "5 NaN NaN NaN NaN\n", + "6 NaN NaN NaN NaN" + ] + } + ], + "prompt_number": 120 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "df_6.sum()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 112, + "text": [ + "pop 23.4\n", + "unempl 18.1\n", + "year 10068.0\n", + "dtype: float64" + ] + } + ], + "prompt_number": 112 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Sum over the rows:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "df_6.sum(axis=1)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 113, + "text": [ + "0 2017.0\n", + "1 2018.1\n", + "2 2025.2\n", + "3 2024.0\n", + "4 2025.2\n", + "5 NaN\n", + "6 NaN\n", + "dtype: float64" + ] + } + ], + "prompt_number": 113 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Account for NaNs:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "df_6.sum(axis=1, skipna=False)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 115, + "text": [ + "0 NaN\n", + "1 NaN\n", + "2 2025.2\n", + "3 2024.0\n", + "4 2025.2\n", + "5 NaN\n", + "6 NaN\n", + "dtype: float64" + ] + } + ], + "prompt_number": 115 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "df_6.idxmax()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "could not convert string to float: MD", + "output_type": "pyerr", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf_6\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0midxmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/Users/dmartin/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc\u001b[0m in \u001b[0;36midxmax\u001b[0;34m(self, axis, skipna)\u001b[0m\n\u001b[1;32m 4197\u001b[0m \"\"\"\n\u001b[1;32m 4198\u001b[0m \u001b[0maxis\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_axis_number\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4199\u001b[0;31m \u001b[0mindices\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnanops\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnanargmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mskipna\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mskipna\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4200\u001b[0m \u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_axis\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4201\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mi\u001b[0m \u001b[0;34m>=\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mNA\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mindices\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Users/dmartin/anaconda/lib/python2.7/site-packages/pandas/core/nanops.pyc\u001b[0m in \u001b[0;36mnanargmax\u001b[0;34m(values, axis, skipna)\u001b[0m\n\u001b[1;32m 432\u001b[0m \"\"\"\n\u001b[1;32m 433\u001b[0m values, mask, dtype, _ = _get_values(values, skipna, fill_value_typ='-inf',\n\u001b[0;32m--> 434\u001b[0;31m isfinite=True)\n\u001b[0m\u001b[1;32m 435\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 436\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_maybe_arg_null_out\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmask\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mskipna\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Users/dmartin/anaconda/lib/python2.7/site-packages/pandas/core/nanops.pyc\u001b[0m in \u001b[0;36m_get_values\u001b[0;34m(values, skipna, fill_value, fill_value_typ, isfinite, copy)\u001b[0m\n\u001b[1;32m 157\u001b[0m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_values_from_object\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misfinite\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 159\u001b[0;31m \u001b[0mmask\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_isfinite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 160\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 161\u001b[0m \u001b[0mmask\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0misnull\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Users/dmartin/anaconda/lib/python2.7/site-packages/pandas/core/nanops.pyc\u001b[0m in \u001b[0;36m_isfinite\u001b[0;34m(values)\u001b[0m\n\u001b[1;32m 200\u001b[0m is_integer_dtype(values) or is_bool_dtype(values)):\n\u001b[1;32m 201\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;34m~\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misfinite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 202\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0;34m~\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misfinite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'float64'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 203\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 204\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mValueError\u001b[0m: could not convert string to float: MD" + ] + } + ], + "prompt_number": 117 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [], + "language": "python", + "metadata": {}, + "outputs": [] } ], "metadata": {}