From 89ce172c7753eafb4710201509ef202a30ea2f63 Mon Sep 17 00:00:00 2001 From: Donne Martin Date: Tue, 10 Feb 2015 17:44:21 -0500 Subject: [PATCH] Added IPython Notebook for cleaning data with Pandas. Added snippets for replacing strings. --- pandas/pandas_clean.ipynb | 256 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 256 insertions(+) create mode 100644 pandas/pandas_clean.ipynb diff --git a/pandas/pandas_clean.ipynb b/pandas/pandas_clean.ipynb new file mode 100644 index 0000000..421e83d --- /dev/null +++ b/pandas/pandas_clean.ipynb @@ -0,0 +1,256 @@ +{ + "metadata": { + "name": "", + "signature": "sha256:3d55aefd3368aca223546c0d26816eb99aed51fe2f81f1f2f68d7bdbcc73651d" + }, + "nbformat": 3, + "nbformat_minor": 0, + "worksheets": [ + { + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Pandas Cleaning\n", + "* Clean\n", + "* Transform\n", + "* Merge\n", + "* Reshape" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from pandas import Series, DataFrame\n", + "import pandas as pd" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 1 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "data_1 = {'state' : ['VA', 'VA', 'VA', 'MD', 'MD'],\n", + " 'year' : [2012, 2013, 2014, 2014, 2015],\n", + " 'pop' : [5.0, 5.1, 5.2, 4.0, 4.1]}\n", + "df_1 = DataFrame(data_1)\n", + "df_1" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
popstateyear
0 5.0 VA 2012
1 5.1 VA 2013
2 5.2 VA 2014
3 4.0 MD 2014
4 4.1 MD 2015
\n", + "
" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 2, + "text": [ + " pop state year\n", + "0 5.0 VA 2012\n", + "1 5.1 VA 2013\n", + "2 5.2 VA 2014\n", + "3 4.0 MD 2014\n", + "4 4.1 MD 2015" + ] + } + ], + "prompt_number": 2 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "df_1.replace('VA', 'VIRGINIA', inplace=True)\n", + "df_1" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
popstateyear
0 5.0 VIRGINIA 2012
1 5.1 VIRGINIA 2013
2 5.2 VIRGINIA 2014
3 4.0 MD 2014
4 4.1 MD 2015
\n", + "
" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 3, + "text": [ + " pop state year\n", + "0 5.0 VIRGINIA 2012\n", + "1 5.1 VIRGINIA 2013\n", + "2 5.2 VIRGINIA 2014\n", + "3 4.0 MD 2014\n", + "4 4.1 MD 2015" + ] + } + ], + "prompt_number": 3 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "df_1.replace({'state' : { 'MD' : 'MARYLAND' }})" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
popstateyear
0 5.0 VIRGINIA 2012
1 5.1 VIRGINIA 2013
2 5.2 VIRGINIA 2014
3 4.0 MARYLAND 2014
4 4.1 MARYLAND 2015
\n", + "
" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 6, + "text": [ + " pop state year\n", + "0 5.0 VIRGINIA 2012\n", + "1 5.1 VIRGINIA 2013\n", + "2 5.2 VIRGINIA 2014\n", + "3 4.0 MARYLAND 2014\n", + "4 4.1 MARYLAND 2015" + ] + } + ], + "prompt_number": 6 + } + ], + "metadata": {} + } + ] +} \ No newline at end of file