{ "metadata": { "name": "", "signature": "sha256:b619f1fd1f2d4495d6a2fe9d048c09b7319b119d4e10a5b2348f0ac6f380a27c" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Pandas Cleaning\n", "* Replace\n", "* Drop\n", "* Concatenate" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from pandas import Series, DataFrame\n", "import pandas as pd" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Set up a DataFrame:" ] }, { "cell_type": "code", "collapsed": false, "input": [ "data_1 = {'state' : ['VA', 'VA', 'VA', 'MD', 'MD'],\n", " 'year' : [2012, 2013, 2014, 2014, 2015],\n", " 'population' : [5.0, 5.1, 5.2, 4.0, 4.1]}\n", "df_1 = DataFrame(data_1)\n", "df_1" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
populationstateyear
0 5.0 VA 2012
1 5.1 VA 2013
2 5.2 VA 2014
3 4.0 MD 2014
4 4.1 MD 2015
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 2, "text": [ " population state year\n", "0 5.0 VA 2012\n", "1 5.1 VA 2013\n", "2 5.2 VA 2014\n", "3 4.0 MD 2014\n", "4 4.1 MD 2015" ] } ], "prompt_number": 2 }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Replace" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Replace all occurrences of a string with another string, in place (no copy):" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df_1.replace('VA', 'VIRGINIA', inplace=True)\n", "df_1" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
populationstateyear
0 5.0 VIRGINIA 2012
1 5.1 VIRGINIA 2013
2 5.2 VIRGINIA 2014
3 4.0 MD 2014
4 4.1 MD 2015
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 4, "text": [ " population state year\n", "0 5.0 VIRGINIA 2012\n", "1 5.1 VIRGINIA 2013\n", "2 5.2 VIRGINIA 2014\n", "3 4.0 MD 2014\n", "4 4.1 MD 2015" ] } ], "prompt_number": 4 }, { "cell_type": "markdown", "metadata": {}, "source": [ "In a specified column, replace all occurrences of a string with another string, in place (no copy):" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df_1.replace({'state' : { 'MD' : 'MARYLAND' }}, inplace=True)\n", "df_1" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
populationstateyear
0 5.0 VIRGINIA 2012
1 5.1 VIRGINIA 2013
2 5.2 VIRGINIA 2014
3 4.0 MARYLAND 2014
4 4.1 MARYLAND 2015
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 5, "text": [ " population state year\n", "0 5.0 VIRGINIA 2012\n", "1 5.1 VIRGINIA 2013\n", "2 5.2 VIRGINIA 2014\n", "3 4.0 MARYLAND 2014\n", "4 4.1 MARYLAND 2015" ] } ], "prompt_number": 5 }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Drop" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Drop the 'population' column and return a copy of the DataFrame:" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df_2 = df_1.drop('population', axis=1)\n", "df_2" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
stateyear
0 VIRGINIA 2012
1 VIRGINIA 2013
2 VIRGINIA 2014
3 MARYLAND 2014
4 MARYLAND 2015
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 6, "text": [ " state year\n", "0 VIRGINIA 2012\n", "1 VIRGINIA 2013\n", "2 VIRGINIA 2014\n", "3 MARYLAND 2014\n", "4 MARYLAND 2015" ] } ], "prompt_number": 6 }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Concatenate" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Concatenate two DataFrames:" ] }, { "cell_type": "code", "collapsed": false, "input": [ "data_2 = {'state' : ['NY', 'NY', 'NY', 'FL', 'FL'],\n", " 'year' : [2012, 2013, 2014, 2014, 2015],\n", " 'population' : [6.0, 6.1, 6.2, 3.0, 3.1]}\n", "df_3 = DataFrame(data_2)\n", "df_3" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
populationstateyear
0 6.0 NY 2012
1 6.1 NY 2013
2 6.2 NY 2014
3 3.0 FL 2014
4 3.1 FL 2015
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 7, "text": [ " population state year\n", "0 6.0 NY 2012\n", "1 6.1 NY 2013\n", "2 6.2 NY 2014\n", "3 3.0 FL 2014\n", "4 3.1 FL 2015" ] } ], "prompt_number": 7 }, { "cell_type": "code", "collapsed": false, "input": [ "df_4 = pd.concat([df_1, df_3])\n", "df_4" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
populationstateyear
0 5.0 VIRGINIA 2012
1 5.1 VIRGINIA 2013
2 5.2 VIRGINIA 2014
3 4.0 MARYLAND 2014
4 4.1 MARYLAND 2015
0 6.0 NY 2012
1 6.1 NY 2013
2 6.2 NY 2014
3 3.0 FL 2014
4 3.1 FL 2015
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 8, "text": [ " population state year\n", "0 5.0 VIRGINIA 2012\n", "1 5.1 VIRGINIA 2013\n", "2 5.2 VIRGINIA 2014\n", "3 4.0 MARYLAND 2014\n", "4 4.1 MARYLAND 2015\n", "0 6.0 NY 2012\n", "1 6.1 NY 2013\n", "2 6.2 NY 2014\n", "3 3.0 FL 2014\n", "4 3.1 FL 2015" ] } ], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 8 } ], "metadata": {} } ] }