From e982080f0833820ffd0d5ab0febfbbf329d848af Mon Sep 17 00:00:00 2001 From: Donne Martin Date: Wed, 28 Jan 2015 07:23:38 -0500 Subject: [PATCH] Added DataFrame snippets. --- pandas/pandas.ipynb | 1100 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 1091 insertions(+), 9 deletions(-) diff --git a/pandas/pandas.ipynb b/pandas/pandas.ipynb index 3a09034..2f0f8a5 100644 --- a/pandas/pandas.ipynb +++ b/pandas/pandas.ipynb @@ -1,7 +1,7 @@ { "metadata": { "name": "", - "signature": "sha256:5af6c8db3042b9d07306a075e560855c3bd9a73234feb466482830d025b58068" + "signature": "sha256:1d555e34f97d4a24383bba48a1c34b1526e08e18276d519d2c8afaf3ff0550f4" }, "nbformat": 3, "nbformat_minor": 0, @@ -12,14 +12,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Pandas" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Basics" + "# Pandas\n", + "\n", + "* Series\n", + "* DataFrame" ] }, { @@ -569,6 +565,1092 @@ } ], "prompt_number": 19 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## DataFrame\n", + "\n", + "A DataFrame is a tabular data structure containing an ordered collection of columns. Each column can have a different type. A DataFrame has both row and column indices and is analogous to a dict of Series. Row and column operations are treated roughly symmetrically. Columns returned when indexing a DataFrame are views of the underlying data, not a copy. To obtain a copy, use the Series' copy method.\n", + "\n", + "Create a DataFrame:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "data_1 = {'state': ['VA', 'VA', 'VA', 'MD', 'MD'],\n", + " 'year': [2012, 2013, 2014, 2014, 2015],\n", + " 'pop': [5.0, 5.1, 5.2, 4.0, 4.1]}\n", + "frame_1 = DataFrame(data_1)\n", + "frame_1" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
popstateyear
0 5.0 VA 2012
1 5.1 VA 2013
2 5.2 VA 2014
3 4.0 MD 2014
4 4.1 MD 2015
\n", + "

5 rows \u00d7 3 columns

\n", + "
" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 20, + "text": [ + " pop state year\n", + "0 5.0 VA 2012\n", + "1 5.1 VA 2013\n", + "2 5.2 VA 2014\n", + "3 4.0 MD 2014\n", + "4 4.1 MD 2015\n", + "\n", + "[5 rows x 3 columns]" + ] + } + ], + "prompt_number": 20 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create a DataFrame specifying a sequence of columns:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "frame_2 = DataFrame(data_1, columns=['year', 'state', 'pop'])\n", + "frame_2" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
yearstatepop
0 2012 VA 5.0
1 2013 VA 5.1
2 2014 VA 5.2
3 2014 MD 4.0
4 2015 MD 4.1
\n", + "

5 rows \u00d7 3 columns

\n", + "
" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 21, + "text": [ + " year state pop\n", + "0 2012 VA 5.0\n", + "1 2013 VA 5.1\n", + "2 2014 VA 5.2\n", + "3 2014 MD 4.0\n", + "4 2015 MD 4.1\n", + "\n", + "[5 rows x 3 columns]" + ] + } + ], + "prompt_number": 21 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Like Series, columns that are not present in the data are NaN:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "frame_3 = DataFrame(data_1, columns=['year', 'state', 'pop', 'unempl'])\n", + "frame_3" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
yearstatepopunempl
0 2012 VA 5.0 NaN
1 2013 VA 5.1 NaN
2 2014 VA 5.2 NaN
3 2014 MD 4.0 NaN
4 2015 MD 4.1 NaN
\n", + "

5 rows \u00d7 4 columns

\n", + "
" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 22, + "text": [ + " year state pop unempl\n", + "0 2012 VA 5.0 NaN\n", + "1 2013 VA 5.1 NaN\n", + "2 2014 VA 5.2 NaN\n", + "3 2014 MD 4.0 NaN\n", + "4 2015 MD 4.1 NaN\n", + "\n", + "[5 rows x 4 columns]" + ] + } + ], + "prompt_number": 22 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Retrieve a column by key, returning a Series:\n" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "frame_3['state']" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 23, + "text": [ + "0 VA\n", + "1 VA\n", + "2 VA\n", + "3 MD\n", + "4 MD\n", + "Name: state, dtype: object" + ] + } + ], + "prompt_number": 23 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Retrieve a column by attribute, returning a Series:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "frame_3.year" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 24, + "text": [ + "0 2012\n", + "1 2013\n", + "2 2014\n", + "3 2014\n", + "4 2015\n", + "Name: year, dtype: int64" + ] + } + ], + "prompt_number": 24 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Retrieve a row by index label (the default integer index here):" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "frame_3.ix[0]" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 25, + "text": [ + "year 2012\n", + "state VA\n", + "pop 5\n", + "unempl NaN\n", + "Name: 0, dtype: object" + ] + } + ], + "prompt_number": 25 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Update a column by assignment:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "frame_3['unempl'] = np.arange(5)\n", + "frame_3" + ], + "language": "python", +
"metadata": {}, + "outputs": [ + { + "html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
yearstatepopunempl
0 2012 VA 5.0 0
1 2013 VA 5.1 1
2 2014 VA 5.2 2
3 2014 MD 4.0 3
4 2015 MD 4.1 4
\n", + "

5 rows \u00d7 4 columns

\n", + "
" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 26, + "text": [ + " year state pop unempl\n", + "0 2012 VA 5.0 0\n", + "1 2013 VA 5.1 1\n", + "2 2014 VA 5.2 2\n", + "3 2014 MD 4.0 3\n", + "4 2015 MD 4.1 4\n", + "\n", + "[5 rows x 4 columns]" + ] + } + ], + "prompt_number": 26 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Assign a Series to a column (note if assigning a list or array, the length must match the DataFrame, unlike a Series):" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "unempl = Series([6.0, 6.0, 6.1], index=[2, 3, 4])\n", + "frame_3['unempl'] = unempl\n", + "frame_3" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
yearstatepopunempl
0 2012 VA 5.0 NaN
1 2013 VA 5.1 NaN
2 2014 VA 5.2 6.0
3 2014 MD 4.0 6.0
4 2015 MD 4.1 6.1
\n", + "

5 rows \u00d7 4 columns

\n", + "
" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 27, + "text": [ + " year state pop unempl\n", + "0 2012 VA 5.0 NaN\n", + "1 2013 VA 5.1 NaN\n", + "2 2014 VA 5.2 6.0\n", + "3 2014 MD 4.0 6.0\n", + "4 2015 MD 4.1 6.1\n", + "\n", + "[5 rows x 4 columns]" + ] + } + ], + "prompt_number": 27 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Assign to a column that doesn't exist to create a new column:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "frame_3['state_dup'] = frame_3['state']\n", + "frame_3" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
yearstatepopunemplstate_dup
0 2012 VA 5.0 NaN VA
1 2013 VA 5.1 NaN VA
2 2014 VA 5.2 6.0 VA
3 2014 MD 4.0 6.0 MD
4 2015 MD 4.1 6.1 MD
\n", + "

5 rows \u00d7 5 columns

\n", + "
" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 28, + "text": [ + " year state pop unempl state_dup\n", + "0 2012 VA 5.0 NaN VA\n", + "1 2013 VA 5.1 NaN VA\n", + "2 2014 VA 5.2 6.0 VA\n", + "3 2014 MD 4.0 6.0 MD\n", + "4 2015 MD 4.1 6.1 MD\n", + "\n", + "[5 rows x 5 columns]" + ] + } + ], + "prompt_number": 28 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Delete a column:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "del frame_3['state_dup']\n", + "frame_3" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
yearstatepopunempl
0 2012 VA 5.0 NaN
1 2013 VA 5.1 NaN
2 2014 VA 5.2 6.0
3 2014 MD 4.0 6.0
4 2015 MD 4.1 6.1
\n", + "

5 rows \u00d7 4 columns

\n", + "
" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 29, + "text": [ + " year state pop unempl\n", + "0 2012 VA 5.0 NaN\n", + "1 2013 VA 5.1 NaN\n", + "2 2014 VA 5.2 6.0\n", + "3 2014 MD 4.0 6.0\n", + "4 2015 MD 4.1 6.1\n", + "\n", + "[5 rows x 4 columns]" + ] + } + ], + "prompt_number": 29 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create a DataFrame from a nested dict of dicts (the keys in the inner dicts are unioned and sorted to form the index in the result, unless an explicit index is specified):" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "pop = {'VA' : {2013 : 5.1, 2014 : 5.2},\n", + " 'MD' : {2014 : 4.0, 2015 : 4.1}}\n", + "frame_4 = DataFrame(pop)\n", + "frame_4" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MDVA
2013 NaN 5.1
2014 4.0 5.2
2015 4.1 NaN
\n", + "

3 rows \u00d7 2 columns

\n", + "
" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 30, + "text": [ + " MD VA\n", + "2013 NaN 5.1\n", + "2014 4.0 5.2\n", + "2015 4.1 NaN\n", + "\n", + "[3 rows x 2 columns]" + ] + } + ], + "prompt_number": 30 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Transpose the DataFrame:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "frame_4.T" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
201320142015
MD NaN 4.0 4.1
VA 5.1 5.2 NaN
\n", + "

2 rows \u00d7 3 columns

\n", + "
" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 31, + "text": [ + " 2013 2014 2015\n", + "MD NaN 4.0 4.1\n", + "VA 5.1 5.2 NaN\n", + "\n", + "[2 rows x 3 columns]" + ] + } + ], + "prompt_number": 31 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create a DataFrame from a dict of Series:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "data_2 = {'VA' : frame_4['VA'][1:],\n", + " 'MD' : frame_4['MD'][2:]}\n", + "frame_5 = DataFrame(data_2)\n", + "frame_5" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MDVA
2014 NaN 5.2
2015 4.1 NaN
\n", + "

2 rows \u00d7 2 columns

\n", + "
" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 32, + "text": [ + " MD VA\n", + "2014 NaN 5.2\n", + "2015 4.1 NaN\n", + "\n", + "[2 rows x 2 columns]" + ] + } + ], + "prompt_number": 32 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set the DataFrame index name:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "frame_5.index.name = 'year'\n", + "frame_5" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MDVA
year
2014 NaN 5.2
2015 4.1 NaN
\n", + "

2 rows \u00d7 2 columns

\n", + "
" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 37, + "text": [ + " MD VA\n", + "year \n", + "2014 NaN 5.2\n", + "2015 4.1 NaN\n", + "\n", + "[2 rows x 2 columns]" + ] + } + ], + "prompt_number": 37 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set the DataFrame columns name:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "frame_5.columns.name = 'state'\n", + "frame_5" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stateMDVA
year
2014 NaN 5.2
2015 4.1 NaN
\n", + "

2 rows \u00d7 2 columns

\n", + "
" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 38, + "text": [ + "state MD VA\n", + "year \n", + "2014 NaN 5.2\n", + "2015 4.1 NaN\n", + "\n", + "[2 rows x 2 columns]" + ] + } + ], + "prompt_number": 38 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Return the data contained in a DataFrame as a 2D ndarray:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "frame_5.values" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 39, + "text": [ + "array([[ nan, 5.2],\n", + " [ 4.1, nan]])" + ] + } + ], + "prompt_number": 39 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If the columns are different dtypes, the 2D ndarray's dtype will accommodate all of the columns:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "frame_3.values" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 41, + "text": [ + "array([[2012, 'VA', 5.0, nan],\n", + " [2013, 'VA', 5.1, nan],\n", + " [2014, 'VA', 5.2, 6.0],\n", + " [2014, 'MD', 4.0, 6.0],\n", + " [2015, 'MD', 4.1, 6.1]], dtype=object)" + ] + } + ], + "prompt_number": 41 } ], "metadata": {}