{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Pandas Cleaning\n",
"* Replace\n",
"* Drop\n",
"* Concatenate"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from pandas import Series, DataFrame\n",
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Set up a DataFrame:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
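{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>population</th>\n",
"      <th>state</th>\n",
"      <th>year</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>0</th>\n",
"      <td>5.0</td>\n",
"      <td>VA</td>\n",
"      <td>2012</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>1</th>\n",
"      <td>5.1</td>\n",
"      <td>VA</td>\n",
"      <td>2013</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>2</th>\n",
"      <td>5.2</td>\n",
"      <td>VA</td>\n",
"      <td>2014</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>3</th>\n",
"      <td>4.0</td>\n",
"      <td>MD</td>\n",
"      <td>2014</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>4</th>\n",
"      <td>4.1</td>\n",
"      <td>MD</td>\n",
"      <td>2015</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],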
"text/plain": [
" population state year\n",
"0 5.0 VA 2012\n",
"1 5.1 VA 2013\n",
"2 5.2 VA 2014\n",
"3 4.0 MD 2014\n",
"4 4.1 MD 2015"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_1 = {'state' : ['VA', 'VA', 'VA', 'MD', 'MD'],\n",
" 'year' : [2012, 2013, 2014, 2014, 2015],\n",
" 'population' : [5.0, 5.1, 5.2, 4.0, 4.1]}\n",
"df_1 = DataFrame(data_1)\n",
"df_1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Replace"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Replace every value that exactly equals a given string (whole-cell match, not substring) with another string, in place (no copy):"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
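{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>population</th>\n",
"      <th>state</th>\n",
"      <th>year</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>0</th>\n",
"      <td>5.0</td>\n",
"      <td>VIRGINIA</td>\n",
"      <td>2012</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>1</th>\n",
"      <td>5.1</td>\n",
"      <td>VIRGINIA</td>\n",
"      <td>2013</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>2</th>\n",
"      <td>5.2</td>\n",
"      <td>VIRGINIA</td>\n",
"      <td>2014</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>3</th>\n",
"      <td>4.0</td>\n",
"      <td>MD</td>\n",
"      <td>2014</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>4</th>\n",
"      <td>4.1</td>\n",
"      <td>MD</td>\n",
"      <td>2015</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],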
"text/plain": [
" population state year\n",
"0 5.0 VIRGINIA 2012\n",
"1 5.1 VIRGINIA 2013\n",
"2 5.2 VIRGINIA 2014\n",
"3 4.0 MD 2014\n",
"4 4.1 MD 2015"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_1.replace('VA', 'VIRGINIA', inplace=True)\n",
"df_1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In a specified column, replace all occurrences of a string with another string, in place (no copy), by passing a nested dict of the form `{column: {old: new}}`:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
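{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>population</th>\n",
"      <th>state</th>\n",
"      <th>year</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>0</th>\n",
"      <td>5.0</td>\n",
"      <td>VIRGINIA</td>\n",
"      <td>2012</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>1</th>\n",
"      <td>5.1</td>\n",
"      <td>VIRGINIA</td>\n",
"      <td>2013</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>2</th>\n",
"      <td>5.2</td>\n",
"      <td>VIRGINIA</td>\n",
"      <td>2014</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>3</th>\n",
"      <td>4.0</td>\n",
"      <td>MARYLAND</td>\n",
"      <td>2014</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>4</th>\n",
"      <td>4.1</td>\n",
"      <td>MARYLAND</td>\n",
"      <td>2015</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],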
"text/plain": [
" population state year\n",
"0 5.0 VIRGINIA 2012\n",
"1 5.1 VIRGINIA 2013\n",
"2 5.2 VIRGINIA 2014\n",
"3 4.0 MARYLAND 2014\n",
"4 4.1 MARYLAND 2015"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_1.replace({'state' : { 'MD' : 'MARYLAND' }}, inplace=True)\n",
"df_1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Drop"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Drop the 'population' column and return a copy of the DataFrame:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
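{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>state</th>\n",
"      <th>year</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>0</th>\n",
"      <td>VIRGINIA</td>\n",
"      <td>2012</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>1</th>\n",
"      <td>VIRGINIA</td>\n",
"      <td>2013</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>2</th>\n",
"      <td>VIRGINIA</td>\n",
"      <td>2014</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>3</th>\n",
"      <td>MARYLAND</td>\n",
"      <td>2014</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>4</th>\n",
"      <td>MARYLAND</td>\n",
"      <td>2015</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],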
"text/plain": [
" state year\n",
"0 VIRGINIA 2012\n",
"1 VIRGINIA 2013\n",
"2 VIRGINIA 2014\n",
"3 MARYLAND 2014\n",
"4 MARYLAND 2015"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_2 = df_1.drop('population', axis=1)\n",
"df_2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Concatenate"
]
},
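{
"cell_type": "markdown",
"metadata": {},
"source": [
"Concatenate two DataFrames. First build a second DataFrame with the same columns, then stack the two row-wise with `pd.concat`. The original row labels are kept, so index values 0-4 appear twice in the result (pass `ignore_index=True` to renumber them):"
]
},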
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
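{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>population</th>\n",
"      <th>state</th>\n",
"      <th>year</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>0</th>\n",
"      <td>6.0</td>\n",
"      <td>NY</td>\n",
"      <td>2012</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>1</th>\n",
"      <td>6.1</td>\n",
"      <td>NY</td>\n",
"      <td>2013</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>2</th>\n",
"      <td>6.2</td>\n",
"      <td>NY</td>\n",
"      <td>2014</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>3</th>\n",
"      <td>3.0</td>\n",
"      <td>FL</td>\n",
"      <td>2014</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>4</th>\n",
"      <td>3.1</td>\n",
"      <td>FL</td>\n",
"      <td>2015</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],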
"text/plain": [
" population state year\n",
"0 6.0 NY 2012\n",
"1 6.1 NY 2013\n",
"2 6.2 NY 2014\n",
"3 3.0 FL 2014\n",
"4 3.1 FL 2015"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_2 = {'state' : ['NY', 'NY', 'NY', 'FL', 'FL'],\n",
" 'year' : [2012, 2013, 2014, 2014, 2015],\n",
" 'population' : [6.0, 6.1, 6.2, 3.0, 3.1]}\n",
"df_3 = DataFrame(data_2)\n",
"df_3"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
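{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>population</th>\n",
"      <th>state</th>\n",
"      <th>year</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>0</th>\n",
"      <td>5.0</td>\n",
"      <td>VIRGINIA</td>\n",
"      <td>2012</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>1</th>\n",
"      <td>5.1</td>\n",
"      <td>VIRGINIA</td>\n",
"      <td>2013</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>2</th>\n",
"      <td>5.2</td>\n",
"      <td>VIRGINIA</td>\n",
"      <td>2014</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>3</th>\n",
"      <td>4.0</td>\n",
"      <td>MARYLAND</td>\n",
"      <td>2014</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>4</th>\n",
"      <td>4.1</td>\n",
"      <td>MARYLAND</td>\n",
"      <td>2015</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>0</th>\n",
"      <td>6.0</td>\n",
"      <td>NY</td>\n",
"      <td>2012</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>1</th>\n",
"      <td>6.1</td>\n",
"      <td>NY</td>\n",
"      <td>2013</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>2</th>\n",
"      <td>6.2</td>\n",
"      <td>NY</td>\n",
"      <td>2014</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>3</th>\n",
"      <td>3.0</td>\n",
"      <td>FL</td>\n",
"      <td>2014</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>4</th>\n",
"      <td>3.1</td>\n",
"      <td>FL</td>\n",
"      <td>2015</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],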
"text/plain": [
" population state year\n",
"0 5.0 VIRGINIA 2012\n",
"1 5.1 VIRGINIA 2013\n",
"2 5.2 VIRGINIA 2014\n",
"3 4.0 MARYLAND 2014\n",
"4 4.1 MARYLAND 2015\n",
"0 6.0 NY 2012\n",
"1 6.1 NY 2013\n",
"2 6.2 NY 2014\n",
"3 3.0 FL 2014\n",
"4 3.1 FL 2015"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_4 = pd.concat([df_1, df_3])\n",
"df_4"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 0
}