{ "metadata": { "name": "", "signature": "sha256:1c8b7cab9b55eb5888612d0b5149649565c258456e73e61b039225439aa11502" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Pandas Cleaning\n", "* Clean\n", "* Transform\n", "* Merge\n", "* Reshape\n", "* Concatenate" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from pandas import Series, DataFrame\n", "import pandas as pd" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Check for matching values in a specific column for replacement:" ] }, { "cell_type": "code", "collapsed": false, "input": [ "data_1 = {'state' : ['VA', 'VA', 'VA', 'MD', 'MD'],\n", " 'year' : [2012, 2013, 2014, 2014, 2015],\n", " 'pop' : [5.0, 5.1, 5.2, 4.0, 4.1]}\n", "df_1 = DataFrame(data_1)\n", "df_1" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
popstateyear
0 5.0 VA 2012
1 5.1 VA 2013
2 5.2 VA 2014
3 4.0 MD 2014
4 4.1 MD 2015
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 2, "text": [ " pop state year\n", "0 5.0 VA 2012\n", "1 5.1 VA 2013\n", "2 5.2 VA 2014\n", "3 4.0 MD 2014\n", "4 4.1 MD 2015" ] } ], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "df_1[df_1['state'] == 'VA']" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
popstateyear
0 5.0 VA 2012
1 5.1 VA 2013
2 5.2 VA 2014
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 3, "text": [ " pop state year\n", "0 5.0 VA 2012\n", "1 5.1 VA 2013\n", "2 5.2 VA 2014" ] } ], "prompt_number": 3 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Replace all occurrences of a string with another string, in place (no copy):" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df_1.replace('VA', 'VIRGINIA', inplace=True)\n", "df_1" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
popstateyear
0 5.0 VIRGINIA 2012
1 5.1 VIRGINIA 2013
2 5.2 VIRGINIA 2014
3 4.0 MD 2014
4 4.1 MD 2015
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 4, "text": [ " pop state year\n", "0 5.0 VIRGINIA 2012\n", "1 5.1 VIRGINIA 2013\n", "2 5.2 VIRGINIA 2014\n", "3 4.0 MD 2014\n", "4 4.1 MD 2015" ] } ], "prompt_number": 4 }, { "cell_type": "markdown", "metadata": {}, "source": [ "In a specified column, replace all occurrences of a string with another string, in place (no copy):" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df_1.replace({'state' : { 'MD' : 'MARYLAND' }}, inplace=True)\n", "df_1" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
popstateyear
0 5.0 VIRGINIA 2012
1 5.1 VIRGINIA 2013
2 5.2 VIRGINIA 2014
3 4.0 MARYLAND 2014
4 4.1 MARYLAND 2015
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 5, "text": [ " pop state year\n", "0 5.0 VIRGINIA 2012\n", "1 5.1 VIRGINIA 2013\n", "2 5.2 VIRGINIA 2014\n", "3 4.0 MARYLAND 2014\n", "4 4.1 MARYLAND 2015" ] } ], "prompt_number": 5 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Concatenate two DataFrames:" ] }, { "cell_type": "code", "collapsed": false, "input": [ "data_2 = {'state' : ['NY', 'NY', 'NY', 'FL', 'FL'],\n", " 'year' : [2012, 2013, 2014, 2014, 2015],\n", " 'pop' : [6.0, 6.1, 6.2, 3.0, 3.1]}\n", "df_2 = DataFrame(data_2)\n", "df_2" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
popstateyear
0 6.0 NY 2012
1 6.1 NY 2013
2 6.2 NY 2014
3 3.0 FL 2014
4 3.1 FL 2015
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 6, "text": [ " pop state year\n", "0 6.0 NY 2012\n", "1 6.1 NY 2013\n", "2 6.2 NY 2014\n", "3 3.0 FL 2014\n", "4 3.1 FL 2015" ] } ], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ "df_3 = pd.concat([df_1, df_2])\n", "df_3" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
popstateyear
0 5.0 VIRGINIA 2012
1 5.1 VIRGINIA 2013
2 5.2 VIRGINIA 2014
3 4.0 MARYLAND 2014
4 4.1 MARYLAND 2015
0 6.0 NY 2012
1 6.1 NY 2013
2 6.2 NY 2014
3 3.0 FL 2014
4 3.1 FL 2015
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 7, "text": [ " pop state year\n", "0 5.0 VIRGINIA 2012\n", "1 5.1 VIRGINIA 2013\n", "2 5.2 VIRGINIA 2014\n", "3 4.0 MARYLAND 2014\n", "4 4.1 MARYLAND 2015\n", "0 6.0 NY 2012\n", "1 6.1 NY 2013\n", "2 6.2 NY 2014\n", "3 3.0 FL 2014\n", "4 3.1 FL 2015" ] } ], "prompt_number": 7 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 7 } ], "metadata": {} } ] }