diff --git a/pandas/pandas_clean.ipynb b/pandas/pandas_clean.ipynb index 88efc19..fec0430 100644 --- a/pandas/pandas_clean.ipynb +++ b/pandas/pandas_clean.ipynb @@ -1,569 +1,591 @@ { - "metadata": { - "name": "", - "signature": "sha256:b619f1fd1f2d4495d6a2fe9d048c09b7319b119d4e10a5b2348f0ac6f380a27c" - }, - "nbformat": 3, - "nbformat_minor": 0, - "worksheets": [ + "cells": [ { - "cells": [ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Pandas Cleaning\n", + "* Replace\n", + "* Drop\n", + "* Concatenate" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from pandas import Series, DataFrame\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Setup a DataFrame:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [ { - "cell_type": "markdown", + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
populationstateyear
0 5.0 VA 2012
1 5.1 VA 2013
2 5.2 VA 2014
3 4.0 MD 2014
4 4.1 MD 2015
\n", + "
" + ], + "text/plain": [ + " population state year\n", + "0 5.0 VA 2012\n", + "1 5.1 VA 2013\n", + "2 5.2 VA 2014\n", + "3 4.0 MD 2014\n", + "4 4.1 MD 2015" + ] + }, + "execution_count": 2, "metadata": {}, - "source": [ - "# Pandas Cleaning\n", - "* Replace\n", - "* Drop\n", - "* Concatenate" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from pandas import Series, DataFrame\n", - "import pandas as pd" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Setup a DataFrame:" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data_1 = {'state' : ['VA', 'VA', 'VA', 'MD', 'MD'],\n", - " 'year' : [2012, 2013, 2014, 2014, 2015],\n", - " 'population' : [5.0, 5.1, 5.2, 4.0, 4.1]}\n", - "df_1 = DataFrame(data_1)\n", - "df_1" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "html": [ - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
populationstateyear
0 5.0 VA 2012
1 5.1 VA 2013
2 5.2 VA 2014
3 4.0 MD 2014
4 4.1 MD 2015
\n", - "
" - ], - "metadata": {}, - "output_type": "pyout", - "prompt_number": 2, - "text": [ - " population state year\n", - "0 5.0 VA 2012\n", - "1 5.1 VA 2013\n", - "2 5.2 VA 2014\n", - "3 4.0 MD 2014\n", - "4 4.1 MD 2015" - ] - } - ], - "prompt_number": 2 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Replace" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Replace all occurrences of a string with another string, in place (no copy):" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df_1.replace('VA', 'VIRGINIA', inplace=True)\n", - "df_1" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "html": [ - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
populationstateyear
0 5.0 VIRGINIA 2012
1 5.1 VIRGINIA 2013
2 5.2 VIRGINIA 2014
3 4.0 MD 2014
4 4.1 MD 2015
\n", - "
" - ], - "metadata": {}, - "output_type": "pyout", - "prompt_number": 4, - "text": [ - " population state year\n", - "0 5.0 VIRGINIA 2012\n", - "1 5.1 VIRGINIA 2013\n", - "2 5.2 VIRGINIA 2014\n", - "3 4.0 MD 2014\n", - "4 4.1 MD 2015" - ] - } - ], - "prompt_number": 4 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In a specified column, replace all occurrences of a string with another string, in place (no copy):" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df_1.replace({'state' : { 'MD' : 'MARYLAND' }}, inplace=True)\n", - "df_1" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "html": [ - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
populationstateyear
0 5.0 VIRGINIA 2012
1 5.1 VIRGINIA 2013
2 5.2 VIRGINIA 2014
3 4.0 MARYLAND 2014
4 4.1 MARYLAND 2015
\n", - "
" - ], - "metadata": {}, - "output_type": "pyout", - "prompt_number": 5, - "text": [ - " population state year\n", - "0 5.0 VIRGINIA 2012\n", - "1 5.1 VIRGINIA 2013\n", - "2 5.2 VIRGINIA 2014\n", - "3 4.0 MARYLAND 2014\n", - "4 4.1 MARYLAND 2015" - ] - } - ], - "prompt_number": 5 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Drop" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Drop the 'population' column and return a copy of the DataFrame:" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df_2 = df_1.drop('population', axis=1)\n", - "df_2" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "html": [ - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
stateyear
0 VIRGINIA 2012
1 VIRGINIA 2013
2 VIRGINIA 2014
3 MARYLAND 2014
4 MARYLAND 2015
\n", - "
" - ], - "metadata": {}, - "output_type": "pyout", - "prompt_number": 6, - "text": [ - " state year\n", - "0 VIRGINIA 2012\n", - "1 VIRGINIA 2013\n", - "2 VIRGINIA 2014\n", - "3 MARYLAND 2014\n", - "4 MARYLAND 2015" - ] - } - ], - "prompt_number": 6 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Concatenate" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Concatenate two DataFrames:" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data_2 = {'state' : ['NY', 'NY', 'NY', 'FL', 'FL'],\n", - " 'year' : [2012, 2013, 2014, 2014, 2015],\n", - " 'population' : [6.0, 6.1, 6.2, 3.0, 3.1]}\n", - "df_3 = DataFrame(data_2)\n", - "df_3" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "html": [ - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
populationstateyear
0 6.0 NY 2012
1 6.1 NY 2013
2 6.2 NY 2014
3 3.0 FL 2014
4 3.1 FL 2015
\n", - "
" - ], - "metadata": {}, - "output_type": "pyout", - "prompt_number": 7, - "text": [ - " population state year\n", - "0 6.0 NY 2012\n", - "1 6.1 NY 2013\n", - "2 6.2 NY 2014\n", - "3 3.0 FL 2014\n", - "4 3.1 FL 2015" - ] - } - ], - "prompt_number": 7 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df_4 = pd.concat([df_1, df_3])\n", - "df_4" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "html": [ - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
populationstateyear
0 5.0 VIRGINIA 2012
1 5.1 VIRGINIA 2013
2 5.2 VIRGINIA 2014
3 4.0 MARYLAND 2014
4 4.1 MARYLAND 2015
0 6.0 NY 2012
1 6.1 NY 2013
2 6.2 NY 2014
3 3.0 FL 2014
4 3.1 FL 2015
\n", - "
" - ], - "metadata": {}, - "output_type": "pyout", - "prompt_number": 8, - "text": [ - " population state year\n", - "0 5.0 VIRGINIA 2012\n", - "1 5.1 VIRGINIA 2013\n", - "2 5.2 VIRGINIA 2014\n", - "3 4.0 MARYLAND 2014\n", - "4 4.1 MARYLAND 2015\n", - "0 6.0 NY 2012\n", - "1 6.1 NY 2013\n", - "2 6.2 NY 2014\n", - "3 3.0 FL 2014\n", - "4 3.1 FL 2015" - ] - } - ], - "prompt_number": 8 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 8 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "data_1 = {'state' : ['VA', 'VA', 'VA', 'MD', 'MD'],\n", + " 'year' : [2012, 2013, 2014, 2014, 2015],\n", + " 'population' : [5.0, 5.1, 5.2, 4.0, 4.1]}\n", + "df_1 = DataFrame(data_1)\n", + "df_1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Replace" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Replace all occurrences of a string with another string, in place (no copy):" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
populationstateyear
0 5.0 VIRGINIA 2012
1 5.1 VIRGINIA 2013
2 5.2 VIRGINIA 2014
3 4.0 MD 2014
4 4.1 MD 2015
\n", + "
" + ], + "text/plain": [ + " population state year\n", + "0 5.0 VIRGINIA 2012\n", + "1 5.1 VIRGINIA 2013\n", + "2 5.2 VIRGINIA 2014\n", + "3 4.0 MD 2014\n", + "4 4.1 MD 2015" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_1.replace('VA', 'VIRGINIA', inplace=True)\n", + "df_1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In a specified column, replace all occurrences of a string with another string, in place (no copy):" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
populationstateyear
0 5.0 VIRGINIA 2012
1 5.1 VIRGINIA 2013
2 5.2 VIRGINIA 2014
3 4.0 MARYLAND 2014
4 4.1 MARYLAND 2015
\n", + "
" + ], + "text/plain": [ + " population state year\n", + "0 5.0 VIRGINIA 2012\n", + "1 5.1 VIRGINIA 2013\n", + "2 5.2 VIRGINIA 2014\n", + "3 4.0 MARYLAND 2014\n", + "4 4.1 MARYLAND 2015" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_1.replace({'state' : { 'MD' : 'MARYLAND' }}, inplace=True)\n", + "df_1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Drop" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Drop the 'population' column and return a copy of the DataFrame:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stateyear
0 VIRGINIA 2012
1 VIRGINIA 2013
2 VIRGINIA 2014
3 MARYLAND 2014
4 MARYLAND 2015
\n", + "
" + ], + "text/plain": [ + " state year\n", + "0 VIRGINIA 2012\n", + "1 VIRGINIA 2013\n", + "2 VIRGINIA 2014\n", + "3 MARYLAND 2014\n", + "4 MARYLAND 2015" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_2 = df_1.drop('population', axis=1)\n", + "df_2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Concatenate" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Concatenate two DataFrames:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
populationstateyear
0 6.0 NY 2012
1 6.1 NY 2013
2 6.2 NY 2014
3 3.0 FL 2014
4 3.1 FL 2015
\n", + "
" + ], + "text/plain": [ + " population state year\n", + "0 6.0 NY 2012\n", + "1 6.1 NY 2013\n", + "2 6.2 NY 2014\n", + "3 3.0 FL 2014\n", + "4 3.1 FL 2015" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_2 = {'state' : ['NY', 'NY', 'NY', 'FL', 'FL'],\n", + " 'year' : [2012, 2013, 2014, 2014, 2015],\n", + " 'population' : [6.0, 6.1, 6.2, 3.0, 3.1]}\n", + "df_3 = DataFrame(data_2)\n", + "df_3" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
populationstateyear
0 5.0 VIRGINIA 2012
1 5.1 VIRGINIA 2013
2 5.2 VIRGINIA 2014
3 4.0 MARYLAND 2014
4 4.1 MARYLAND 2015
0 6.0 NY 2012
1 6.1 NY 2013
2 6.2 NY 2014
3 3.0 FL 2014
4 3.1 FL 2015
\n", + "
" + ], + "text/plain": [ + " population state year\n", + "0 5.0 VIRGINIA 2012\n", + "1 5.1 VIRGINIA 2013\n", + "2 5.2 VIRGINIA 2014\n", + "3 4.0 MARYLAND 2014\n", + "4 4.1 MARYLAND 2015\n", + "0 6.0 NY 2012\n", + "1 6.1 NY 2013\n", + "2 6.2 NY 2014\n", + "3 3.0 FL 2014\n", + "4 3.1 FL 2015" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_4 = pd.concat([df_1, df_3])\n", + "df_4" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [] } - ] -} \ No newline at end of file + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}