mirror of
https://github.com/donnemartin/data-science-ipython-notebooks.git
synced 2024-03-22 13:30:56 +08:00
Added snippets for Summarizing and Computing Descriptive Statistics.
This commit is contained in:
parent
f27f699078
commit
23e62231d2
|
@ -1,7 +1,7 @@
|
||||||
{
|
{
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"name": "",
|
"name": "",
|
||||||
"signature": "sha256:a0f056a23acdf795f7de8660469dcb0323a659e710786e7d33a158a6b72c6c03"
|
"signature": "sha256:364642bdaf4f304a679c7bbde135fea5a7e31eb8da19fb9b24192091e3e34cb0"
|
||||||
},
|
},
|
||||||
"nbformat": 3,
|
"nbformat": 3,
|
||||||
"nbformat_minor": 0,
|
"nbformat_minor": 0,
|
||||||
|
@ -22,7 +22,8 @@
|
||||||
"* Arithmetic and Data Alignment\n",
|
"* Arithmetic and Data Alignment\n",
|
||||||
"* Function Application and Mapping\n",
|
"* Function Application and Mapping\n",
|
||||||
"* Sorting and Ranking\n",
|
"* Sorting and Ranking\n",
|
||||||
"* Axis Indices with Duplicate Values"
|
"* Axis Indices with Duplicate Values\n",
|
||||||
|
"* Summarizing and Computing Descriptive Statistics"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -5354,6 +5355,238 @@
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"prompt_number": 102
|
"prompt_number": 102
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Summarizing and Computing Descriptive Statistics"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Unlike NumPy arrays, Pandas descriptive statistics automatically exclude missing data. NaN values are excluded unless the entire row or column is NA."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [
|
||||||
|
"df_6"
|
||||||
|
],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"html": [
|
||||||
|
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>state</th>\n",
|
||||||
|
" <th>pop</th>\n",
|
||||||
|
" <th>unempl</th>\n",
|
||||||
|
" <th>year</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td> VA</td>\n",
|
||||||
|
" <td> 5.0</td>\n",
|
||||||
|
" <td> NaN</td>\n",
|
||||||
|
" <td> 2012</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td> VA</td>\n",
|
||||||
|
" <td> 5.1</td>\n",
|
||||||
|
" <td> NaN</td>\n",
|
||||||
|
" <td> 2013</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td> VA</td>\n",
|
||||||
|
" <td> 5.2</td>\n",
|
||||||
|
" <td> 6.0</td>\n",
|
||||||
|
" <td> 2014</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td> MD</td>\n",
|
||||||
|
" <td> 4.0</td>\n",
|
||||||
|
" <td> 6.0</td>\n",
|
||||||
|
" <td> 2014</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td> MD</td>\n",
|
||||||
|
" <td> 4.1</td>\n",
|
||||||
|
" <td> 6.1</td>\n",
|
||||||
|
" <td> 2015</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>5</th>\n",
|
||||||
|
" <td> NaN</td>\n",
|
||||||
|
" <td> NaN</td>\n",
|
||||||
|
" <td> NaN</td>\n",
|
||||||
|
" <td> NaN</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>6</th>\n",
|
||||||
|
" <td> NaN</td>\n",
|
||||||
|
" <td> NaN</td>\n",
|
||||||
|
" <td> NaN</td>\n",
|
||||||
|
" <td> NaN</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "pyout",
|
||||||
|
"prompt_number": 120,
|
||||||
|
"text": [
|
||||||
|
" state pop unempl year\n",
|
||||||
|
"0 VA 5.0 NaN 2012\n",
|
||||||
|
"1 VA 5.1 NaN 2013\n",
|
||||||
|
"2 VA 5.2 6.0 2014\n",
|
||||||
|
"3 MD 4.0 6.0 2014\n",
|
||||||
|
"4 MD 4.1 6.1 2015\n",
|
||||||
|
"5 NaN NaN NaN NaN\n",
|
||||||
|
"6 NaN NaN NaN NaN"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"prompt_number": 120
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [
|
||||||
|
"df_6.sum()"
|
||||||
|
],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "pyout",
|
||||||
|
"prompt_number": 112,
|
||||||
|
"text": [
|
||||||
|
"pop 23.4\n",
|
||||||
|
"unempl 18.1\n",
|
||||||
|
"year 10068.0\n",
|
||||||
|
"dtype: float64"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"prompt_number": 112
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Sum over the rows:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [
|
||||||
|
"df_6.sum(axis=1)"
|
||||||
|
],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "pyout",
|
||||||
|
"prompt_number": 113,
|
||||||
|
"text": [
|
||||||
|
"0 2017.0\n",
|
||||||
|
"1 2018.1\n",
|
||||||
|
"2 2025.2\n",
|
||||||
|
"3 2024.0\n",
|
||||||
|
"4 2025.2\n",
|
||||||
|
"5 NaN\n",
|
||||||
|
"6 NaN\n",
|
||||||
|
"dtype: float64"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"prompt_number": 113
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Account for NaNs:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [
|
||||||
|
"df_6.sum(axis=1, skipna=False)"
|
||||||
|
],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "pyout",
|
||||||
|
"prompt_number": 115,
|
||||||
|
"text": [
|
||||||
|
"0 NaN\n",
|
||||||
|
"1 NaN\n",
|
||||||
|
"2 2025.2\n",
|
||||||
|
"3 2024.0\n",
|
||||||
|
"4 2025.2\n",
|
||||||
|
"5 NaN\n",
|
||||||
|
"6 NaN\n",
|
||||||
|
"dtype: float64"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"prompt_number": 115
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [
|
||||||
|
"df_6.idxmax()"
|
||||||
|
],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"ename": "ValueError",
|
||||||
|
"evalue": "could not convert string to float: MD",
|
||||||
|
"output_type": "pyerr",
|
||||||
|
"traceback": [
|
||||||
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
|
||||||
|
"\u001b[0;32m<ipython-input-117-eca0089f6d98>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf_6\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0midxmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
|
||||||
|
"\u001b[0;32m/Users/dmartin/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc\u001b[0m in \u001b[0;36midxmax\u001b[0;34m(self, axis, skipna)\u001b[0m\n\u001b[1;32m 4197\u001b[0m \"\"\"\n\u001b[1;32m 4198\u001b[0m \u001b[0maxis\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_axis_number\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4199\u001b[0;31m \u001b[0mindices\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnanops\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnanargmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mskipna\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mskipna\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4200\u001b[0m \u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_axis\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4201\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mi\u001b[0m \u001b[0;34m>=\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mNA\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mindices\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||||
|
"\u001b[0;32m/Users/dmartin/anaconda/lib/python2.7/site-packages/pandas/core/nanops.pyc\u001b[0m in \u001b[0;36mnanargmax\u001b[0;34m(values, axis, skipna)\u001b[0m\n\u001b[1;32m 432\u001b[0m \"\"\"\n\u001b[1;32m 433\u001b[0m values, mask, dtype, _ = _get_values(values, skipna, fill_value_typ='-inf',\n\u001b[0;32m--> 434\u001b[0;31m isfinite=True)\n\u001b[0m\u001b[1;32m 435\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 436\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_maybe_arg_null_out\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmask\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mskipna\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||||
|
"\u001b[0;32m/Users/dmartin/anaconda/lib/python2.7/site-packages/pandas/core/nanops.pyc\u001b[0m in \u001b[0;36m_get_values\u001b[0;34m(values, skipna, fill_value, fill_value_typ, isfinite, copy)\u001b[0m\n\u001b[1;32m 157\u001b[0m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_values_from_object\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misfinite\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 159\u001b[0;31m \u001b[0mmask\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_isfinite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 160\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 161\u001b[0m \u001b[0mmask\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0misnull\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||||
|
"\u001b[0;32m/Users/dmartin/anaconda/lib/python2.7/site-packages/pandas/core/nanops.pyc\u001b[0m in \u001b[0;36m_isfinite\u001b[0;34m(values)\u001b[0m\n\u001b[1;32m 200\u001b[0m is_integer_dtype(values) or is_bool_dtype(values)):\n\u001b[1;32m 201\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;34m~\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misfinite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 202\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0;34m~\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misfinite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'float64'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 203\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 204\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||||
|
"\u001b[0;31mValueError\u001b[0m: could not convert string to float: MD"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"prompt_number": 117
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": []
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {}
|
"metadata": {}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user