Added snippets for Summarizing and Computing Descriptive Statistics.

This commit is contained in:
Donne Martin 2015-02-08 16:34:16 -05:00
parent f27f699078
commit 23e62231d2

View File

@ -1,7 +1,7 @@
{ {
"metadata": { "metadata": {
"name": "", "name": "",
"signature": "sha256:a0f056a23acdf795f7de8660469dcb0323a659e710786e7d33a158a6b72c6c03" "signature": "sha256:364642bdaf4f304a679c7bbde135fea5a7e31eb8da19fb9b24192091e3e34cb0"
}, },
"nbformat": 3, "nbformat": 3,
"nbformat_minor": 0, "nbformat_minor": 0,
@ -22,7 +22,8 @@
"* Arithmetic and Data Alignment\n", "* Arithmetic and Data Alignment\n",
"* Function Application and Mapping\n", "* Function Application and Mapping\n",
"* Sorting and Ranking\n", "* Sorting and Ranking\n",
"* Axis Indices with Duplicate Values" "* Axis Indices with Duplicate Values\n",
"* Summarizing and Computing Descriptive Statistics"
] ]
}, },
{ {
@ -5354,6 +5355,238 @@
} }
], ],
"prompt_number": 102 "prompt_number": 102
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Summarizing and Computing Descriptive Statistics"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Unlike NumPy arrays, Pandas descriptive statistics automatically exclude missing data. NaN values are excluded unless the entire row or column is NA."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df_6"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>state</th>\n",
" <th>pop</th>\n",
" <th>unempl</th>\n",
" <th>year</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> VA</td>\n",
" <td> 5.0</td>\n",
" <td> NaN</td>\n",
" <td> 2012</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> VA</td>\n",
" <td> 5.1</td>\n",
" <td> NaN</td>\n",
" <td> 2013</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> VA</td>\n",
" <td> 5.2</td>\n",
" <td> 6.0</td>\n",
" <td> 2014</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> MD</td>\n",
" <td> 4.0</td>\n",
" <td> 6.0</td>\n",
" <td> 2014</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td> MD</td>\n",
" <td> 4.1</td>\n",
" <td> 6.1</td>\n",
" <td> 2015</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td> NaN</td>\n",
" <td> NaN</td>\n",
" <td> NaN</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td> NaN</td>\n",
" <td> NaN</td>\n",
" <td> NaN</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 120,
"text": [
" state pop unempl year\n",
"0 VA 5.0 NaN 2012\n",
"1 VA 5.1 NaN 2013\n",
"2 VA 5.2 6.0 2014\n",
"3 MD 4.0 6.0 2014\n",
"4 MD 4.1 6.1 2015\n",
"5 NaN NaN NaN NaN\n",
"6 NaN NaN NaN NaN"
]
}
],
"prompt_number": 120
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df_6.sum()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 112,
"text": [
"pop 23.4\n",
"unempl 18.1\n",
"year 10068.0\n",
"dtype: float64"
]
}
],
"prompt_number": 112
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Sum over the rows:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df_6.sum(axis=1)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 113,
"text": [
"0 2017.0\n",
"1 2018.1\n",
"2 2025.2\n",
"3 2024.0\n",
"4 2025.2\n",
"5 NaN\n",
"6 NaN\n",
"dtype: float64"
]
}
],
"prompt_number": 113
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Account for NaNs:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df_6.sum(axis=1, skipna=False)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 115,
"text": [
"0 NaN\n",
"1 NaN\n",
"2 2025.2\n",
"3 2024.0\n",
"4 2025.2\n",
"5 NaN\n",
"6 NaN\n",
"dtype: float64"
]
}
],
"prompt_number": 115
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df_6.idxmax()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "could not convert string to float: MD",
"output_type": "pyerr",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-117-eca0089f6d98>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf_6\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0midxmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/Users/dmartin/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc\u001b[0m in \u001b[0;36midxmax\u001b[0;34m(self, axis, skipna)\u001b[0m\n\u001b[1;32m 4197\u001b[0m \"\"\"\n\u001b[1;32m 4198\u001b[0m \u001b[0maxis\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_axis_number\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4199\u001b[0;31m \u001b[0mindices\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnanops\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnanargmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mskipna\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mskipna\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4200\u001b[0m \u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_axis\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4201\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mi\u001b[0m \u001b[0;34m>=\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mNA\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mindices\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Users/dmartin/anaconda/lib/python2.7/site-packages/pandas/core/nanops.pyc\u001b[0m in \u001b[0;36mnanargmax\u001b[0;34m(values, axis, skipna)\u001b[0m\n\u001b[1;32m 432\u001b[0m \"\"\"\n\u001b[1;32m 433\u001b[0m values, mask, dtype, _ = _get_values(values, skipna, fill_value_typ='-inf',\n\u001b[0;32m--> 434\u001b[0;31m isfinite=True)\n\u001b[0m\u001b[1;32m 435\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 436\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_maybe_arg_null_out\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmask\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mskipna\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Users/dmartin/anaconda/lib/python2.7/site-packages/pandas/core/nanops.pyc\u001b[0m in \u001b[0;36m_get_values\u001b[0;34m(values, skipna, fill_value, fill_value_typ, isfinite, copy)\u001b[0m\n\u001b[1;32m 157\u001b[0m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_values_from_object\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misfinite\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 159\u001b[0;31m \u001b[0mmask\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_isfinite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 160\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 161\u001b[0m \u001b[0mmask\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0misnull\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Users/dmartin/anaconda/lib/python2.7/site-packages/pandas/core/nanops.pyc\u001b[0m in \u001b[0;36m_isfinite\u001b[0;34m(values)\u001b[0m\n\u001b[1;32m 200\u001b[0m is_integer_dtype(values) or is_bool_dtype(values)):\n\u001b[1;32m 201\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;34m~\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misfinite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 202\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0;34m~\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misfinite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'float64'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 203\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 204\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mValueError\u001b[0m: could not convert string to float: MD"
]
}
],
"prompt_number": 117
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
} }
], ],
"metadata": {} "metadata": {}