{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pandas Cleaning\n",
    "* Replace\n",
    "* Drop\n",
    "* Concatenate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from pandas import Series, DataFrame\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Setup a DataFrame:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>population</th>\n",
       "      <th>state</th>\n",
       "      <th>year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td> 5.0</td>\n",
       "      <td> VA</td>\n",
       "      <td> 2012</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td> 5.1</td>\n",
       "      <td> VA</td>\n",
       "      <td> 2013</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td> 5.2</td>\n",
       "      <td> VA</td>\n",
       "      <td> 2014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td> 4.0</td>\n",
       "      <td> MD</td>\n",
       "      <td> 2014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td> 4.1</td>\n",
       "      <td> MD</td>\n",
       "      <td> 2015</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   population state  year\n",
       "0         5.0    VA  2012\n",
       "1         5.1    VA  2013\n",
       "2         5.2    VA  2014\n",
       "3         4.0    MD  2014\n",
       "4         4.1    MD  2015"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_1 = {'state' : ['VA', 'VA', 'VA', 'MD', 'MD'],\n",
    "          'year' : [2012, 2013, 2014, 2014, 2015],\n",
    "          'population' : [5.0, 5.1, 5.2, 4.0, 4.1]}\n",
    "df_1 = DataFrame(data_1)\n",
    "df_1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Replace"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Replace all occurrences of a string with another string, in place (no copy):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>population</th>\n",
       "      <th>state</th>\n",
       "      <th>year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td> 5.0</td>\n",
       "      <td> VIRGINIA</td>\n",
       "      <td> 2012</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td> 5.1</td>\n",
       "      <td> VIRGINIA</td>\n",
       "      <td> 2013</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td> 5.2</td>\n",
       "      <td> VIRGINIA</td>\n",
       "      <td> 2014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td> 4.0</td>\n",
       "      <td>       MD</td>\n",
       "      <td> 2014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td> 4.1</td>\n",
       "      <td>       MD</td>\n",
       "      <td> 2015</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   population     state  year\n",
       "0         5.0  VIRGINIA  2012\n",
       "1         5.1  VIRGINIA  2013\n",
       "2         5.2  VIRGINIA  2014\n",
       "3         4.0        MD  2014\n",
       "4         4.1        MD  2015"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_1.replace('VA', 'VIRGINIA', inplace=True)\n",
    "df_1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In a specified column, replace all occurrences of a string with another string, in place (no copy):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>population</th>\n",
       "      <th>state</th>\n",
       "      <th>year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td> 5.0</td>\n",
       "      <td> VIRGINIA</td>\n",
       "      <td> 2012</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td> 5.1</td>\n",
       "      <td> VIRGINIA</td>\n",
       "      <td> 2013</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td> 5.2</td>\n",
       "      <td> VIRGINIA</td>\n",
       "      <td> 2014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td> 4.0</td>\n",
       "      <td> MARYLAND</td>\n",
       "      <td> 2014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td> 4.1</td>\n",
       "      <td> MARYLAND</td>\n",
       "      <td> 2015</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   population     state  year\n",
       "0         5.0  VIRGINIA  2012\n",
       "1         5.1  VIRGINIA  2013\n",
       "2         5.2  VIRGINIA  2014\n",
       "3         4.0  MARYLAND  2014\n",
       "4         4.1  MARYLAND  2015"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_1.replace({'state' : { 'MD' : 'MARYLAND' }}, inplace=True)\n",
    "df_1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Drop"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Drop the 'population' column and return a copy of the DataFrame:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>state</th>\n",
       "      <th>year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td> VIRGINIA</td>\n",
       "      <td> 2012</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td> VIRGINIA</td>\n",
       "      <td> 2013</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td> VIRGINIA</td>\n",
       "      <td> 2014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td> MARYLAND</td>\n",
       "      <td> 2014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td> MARYLAND</td>\n",
       "      <td> 2015</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      state  year\n",
       "0  VIRGINIA  2012\n",
       "1  VIRGINIA  2013\n",
       "2  VIRGINIA  2014\n",
       "3  MARYLAND  2014\n",
       "4  MARYLAND  2015"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_2 = df_1.drop('population', axis=1)\n",
    "df_2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Concatenate"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Concatenate two DataFrames:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>population</th>\n",
       "      <th>state</th>\n",
       "      <th>year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td> 6.0</td>\n",
       "      <td> NY</td>\n",
       "      <td> 2012</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td> 6.1</td>\n",
       "      <td> NY</td>\n",
       "      <td> 2013</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td> 6.2</td>\n",
       "      <td> NY</td>\n",
       "      <td> 2014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td> 3.0</td>\n",
       "      <td> FL</td>\n",
       "      <td> 2014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td> 3.1</td>\n",
       "      <td> FL</td>\n",
       "      <td> 2015</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   population state  year\n",
       "0         6.0    NY  2012\n",
       "1         6.1    NY  2013\n",
       "2         6.2    NY  2014\n",
       "3         3.0    FL  2014\n",
       "4         3.1    FL  2015"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_2 = {'state' : ['NY', 'NY', 'NY', 'FL', 'FL'],\n",
    "          'year' : [2012, 2013, 2014, 2014, 2015],\n",
    "          'population' : [6.0, 6.1, 6.2, 3.0, 3.1]}\n",
    "df_3 = DataFrame(data_2)\n",
    "df_3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>population</th>\n",
       "      <th>state</th>\n",
       "      <th>year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td> 5.0</td>\n",
       "      <td> VIRGINIA</td>\n",
       "      <td> 2012</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td> 5.1</td>\n",
       "      <td> VIRGINIA</td>\n",
       "      <td> 2013</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td> 5.2</td>\n",
       "      <td> VIRGINIA</td>\n",
       "      <td> 2014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td> 4.0</td>\n",
       "      <td> MARYLAND</td>\n",
       "      <td> 2014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td> 4.1</td>\n",
       "      <td> MARYLAND</td>\n",
       "      <td> 2015</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td> 6.0</td>\n",
       "      <td>       NY</td>\n",
       "      <td> 2012</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td> 6.1</td>\n",
       "      <td>       NY</td>\n",
       "      <td> 2013</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td> 6.2</td>\n",
       "      <td>       NY</td>\n",
       "      <td> 2014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td> 3.0</td>\n",
       "      <td>       FL</td>\n",
       "      <td> 2014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td> 3.1</td>\n",
       "      <td>       FL</td>\n",
       "      <td> 2015</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   population     state  year\n",
       "0         5.0  VIRGINIA  2012\n",
       "1         5.1  VIRGINIA  2013\n",
       "2         5.2  VIRGINIA  2014\n",
       "3         4.0  MARYLAND  2014\n",
       "4         4.1  MARYLAND  2015\n",
       "0         6.0        NY  2012\n",
       "1         6.1        NY  2013\n",
       "2         6.2        NY  2014\n",
       "3         3.0        FL  2014\n",
       "4         3.1        FL  2015"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_4 = pd.concat([df_1, df_3])\n",
    "df_4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}