From 1bf68e068976591f34a48d42de9ba37ff1b51f92 Mon Sep 17 00:00:00 2001 From: Donne Martin Date: Fri, 12 Jun 2015 20:51:00 -0400 Subject: [PATCH] Combined pandas notebooks until pandas I/O and pandas cleaning are further developed. --- README.md | 2 - pandas/pandas.ipynb | 891 +++++++++++++++++++++++++++++++++++++- pandas/pandas_clean.ipynb | 591 ------------------------- pandas/pandas_io.ipynb | 353 --------------- 4 files changed, 889 insertions(+), 948 deletions(-) delete mode 100644 pandas/pandas_clean.ipynb delete mode 100644 pandas/pandas_io.ipynb diff --git a/README.md b/README.md index 4bf8f8f..c91a770 100644 --- a/README.md +++ b/README.md @@ -125,8 +125,6 @@ IPython Notebook(s) demonstrating pandas functionality. | Notebook | Description | |--------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------| | [pandas](http://nbviewer.ipython.org/github/donnemartin/data-science-ipython-notebooks/blob/master/pandas/pandas.ipynb) | Software library written for data manipulation and analysis in Python. Offers data structures and operations for manipulating numerical tables and time series. | -| [pandas io](http://nbviewer.ipython.org/github/donnemartin/data-science-ipython-notebooks/blob/master/pandas/pandas_io.ipynb) | Input and output operations. | -| [pandas cleaning](http://nbviewer.ipython.org/github/donnemartin/data-science-ipython-notebooks/blob/master/pandas/pandas_clean.ipynb) | Data wrangling operations. |

diff --git a/pandas/pandas.ipynb b/pandas/pandas.ipynb index b0ff2b5..a758c35 100644 --- a/pandas/pandas.ipynb +++ b/pandas/pandas.ipynb @@ -15,7 +15,9 @@ "* Function Application and Mapping\n", "* Sorting and Ranking\n", "* Axis Indices with Duplicate Values\n", - "* Summarizing and Computing Descriptive Statistics" + "* Summarizing and Computing Descriptive Statistics\n", + "* Cleaning Data (Under Construction)\n", + "* Input and Output (Under Construction)" ] }, { @@ -5749,6 +5751,891 @@ "source": [ "df_6.sum(axis=1, skipna=False)" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cleaning Data (Under Construction)\n", + "* Replace\n", + "* Drop\n", + "* Concatenate" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from pandas import Series, DataFrame\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Setup a DataFrame:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "

\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
populationstateyear
0 5.0 VA 2012
1 5.1 VA 2013
2 5.2 VA 2014
3 4.0 MD 2014
4 4.1 MD 2015
\n", + "
" + ], + "text/plain": [ + " population state year\n", + "0 5.0 VA 2012\n", + "1 5.1 VA 2013\n", + "2 5.2 VA 2014\n", + "3 4.0 MD 2014\n", + "4 4.1 MD 2015" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_1 = {'state' : ['VA', 'VA', 'VA', 'MD', 'MD'],\n", + " 'year' : [2012, 2013, 2014, 2014, 2015],\n", + " 'population' : [5.0, 5.1, 5.2, 4.0, 4.1]}\n", + "df_1 = DataFrame(data_1)\n", + "df_1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Replace" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Replace all occurrences of a string with another string, in place (no copy):" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
populationstateyear
0 5.0 VIRGINIA 2012
1 5.1 VIRGINIA 2013
2 5.2 VIRGINIA 2014
3 4.0 MD 2014
4 4.1 MD 2015
\n", + "
" + ], + "text/plain": [ + " population state year\n", + "0 5.0 VIRGINIA 2012\n", + "1 5.1 VIRGINIA 2013\n", + "2 5.2 VIRGINIA 2014\n", + "3 4.0 MD 2014\n", + "4 4.1 MD 2015" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_1.replace('VA', 'VIRGINIA', inplace=True)\n", + "df_1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In a specified column, replace all occurrences of a string with another string, in place (no copy):" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
populationstateyear
0 5.0 VIRGINIA 2012
1 5.1 VIRGINIA 2013
2 5.2 VIRGINIA 2014
3 4.0 MARYLAND 2014
4 4.1 MARYLAND 2015
\n", + "
" + ], + "text/plain": [ + " population state year\n", + "0 5.0 VIRGINIA 2012\n", + "1 5.1 VIRGINIA 2013\n", + "2 5.2 VIRGINIA 2014\n", + "3 4.0 MARYLAND 2014\n", + "4 4.1 MARYLAND 2015" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_1.replace({'state' : { 'MD' : 'MARYLAND' }}, inplace=True)\n", + "df_1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Drop" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Drop the 'population' column and return a copy of the DataFrame:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stateyear
0 VIRGINIA 2012
1 VIRGINIA 2013
2 VIRGINIA 2014
3 MARYLAND 2014
4 MARYLAND 2015
\n", + "
" + ], + "text/plain": [ + " state year\n", + "0 VIRGINIA 2012\n", + "1 VIRGINIA 2013\n", + "2 VIRGINIA 2014\n", + "3 MARYLAND 2014\n", + "4 MARYLAND 2015" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_2 = df_1.drop('population', axis=1)\n", + "df_2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Concatenate" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Concatenate two DataFrames:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
populationstateyear
0 6.0 NY 2012
1 6.1 NY 2013
2 6.2 NY 2014
3 3.0 FL 2014
4 3.1 FL 2015
\n", + "
" + ], + "text/plain": [ + " population state year\n", + "0 6.0 NY 2012\n", + "1 6.1 NY 2013\n", + "2 6.2 NY 2014\n", + "3 3.0 FL 2014\n", + "4 3.1 FL 2015" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_2 = {'state' : ['NY', 'NY', 'NY', 'FL', 'FL'],\n", + " 'year' : [2012, 2013, 2014, 2014, 2015],\n", + " 'population' : [6.0, 6.1, 6.2, 3.0, 3.1]}\n", + "df_3 = DataFrame(data_2)\n", + "df_3" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
populationstateyear
0 5.0 VIRGINIA 2012
1 5.1 VIRGINIA 2013
2 5.2 VIRGINIA 2014
3 4.0 MARYLAND 2014
4 4.1 MARYLAND 2015
0 6.0 NY 2012
1 6.1 NY 2013
2 6.2 NY 2014
3 3.0 FL 2014
4 3.1 FL 2015
\n", + "
" + ], + "text/plain": [ + " population state year\n", + "0 5.0 VIRGINIA 2012\n", + "1 5.1 VIRGINIA 2013\n", + "2 5.2 VIRGINIA 2014\n", + "3 4.0 MARYLAND 2014\n", + "4 4.1 MARYLAND 2015\n", + "0 6.0 NY 2012\n", + "1 6.1 NY 2013\n", + "2 6.2 NY 2014\n", + "3 3.0 FL 2014\n", + "4 3.1 FL 2015" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_4 = pd.concat([df_1, df_3])\n", + "df_4" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Input and Output (Under Construction)\n", + "* Reading\n", + "* Writing" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from pandas import Series, DataFrame\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Reading" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Read data from a CSV file into a DataFrame (use sep='\\t' for TSV):" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df_1 = pd.read_csv(\"../data/ozone.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get a summary of the DataFrame:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
OzoneSolar.RWindTempMonthDay
count 116.000000 146.000000 153.000000 153.000000 153.000000 153.000000
mean 42.129310 185.931507 9.957516 77.882353 6.993464 15.803922
std 32.987885 90.058422 3.523001 9.465270 1.416522 8.864520
min 1.000000 7.000000 1.700000 56.000000 5.000000 1.000000
25% 18.000000 115.750000 7.400000 72.000000 6.000000 8.000000
50% 31.500000 205.000000 9.700000 79.000000 7.000000 16.000000
75% 63.250000 258.750000 11.500000 85.000000 8.000000 23.000000
max 168.000000 334.000000 20.700000 97.000000 9.000000 31.000000
\n", + "
" + ], + "text/plain": [ + " Ozone Solar.R Wind Temp Month Day\n", + "count 116.000000 146.000000 153.000000 153.000000 153.000000 153.000000\n", + "mean 42.129310 185.931507 9.957516 77.882353 6.993464 15.803922\n", + "std 32.987885 90.058422 3.523001 9.465270 1.416522 8.864520\n", + "min 1.000000 7.000000 1.700000 56.000000 5.000000 1.000000\n", + "25% 18.000000 115.750000 7.400000 72.000000 6.000000 8.000000\n", + "50% 31.500000 205.000000 9.700000 79.000000 7.000000 16.000000\n", + "75% 63.250000 258.750000 11.500000 85.000000 8.000000 23.000000\n", + "max 168.000000 334.000000 20.700000 97.000000 9.000000 31.000000" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_1.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "List the first five rows of the DataFrame:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
OzoneSolar.RWindTempMonthDay
0 41 190 7.4 67 5 1
1 36 118 8.0 72 5 2
2 12 149 12.6 74 5 3
3 18 313 11.5 62 5 4
4NaN NaN 14.3 56 5 5
\n", + "
" + ], + "text/plain": [ + " Ozone Solar.R Wind Temp Month Day\n", + "0 41 190 7.4 67 5 1\n", + "1 36 118 8.0 72 5 2\n", + "2 12 149 12.6 74 5 3\n", + "3 18 313 11.5 62 5 4\n", + "4 NaN NaN 14.3 56 5 5" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_1.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Writing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create a copy of the CSV file, encoded in UTF-8 and hiding the index and header labels:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df_1.to_csv('../data/ozone_copy.csv', \n", + " encoding='utf-8', \n", + " index=False, \n", + " header=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "View the data directory:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 16\r\n", + "-rw-r--r--@ 1 dmartin 1443163707 2902 Dec 26 2012 ozone.csv\r\n", + "-rw-r--r-- 1 dmartin 1443163707 3324 Feb 14 06:40 ozone_copy.csv\r\n" + ] + } + ], + "source": [ + "!ls -l ../data/" + ] } ], "metadata": { @@ -5767,7 +6654,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.9" + "version": "2.7.10" } }, "nbformat": 4, diff --git a/pandas/pandas_clean.ipynb b/pandas/pandas_clean.ipynb deleted file mode 100644 index fec0430..0000000 --- a/pandas/pandas_clean.ipynb +++ /dev/null @@ -1,591 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Pandas Cleaning\n", - "* Replace\n", - "* Drop\n", - "* Concatenate" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "from pandas import Series, DataFrame\n", - "import pandas as pd" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Setup a DataFrame:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
populationstateyear
0 5.0 VA 2012
1 5.1 VA 2013
2 5.2 VA 2014
3 4.0 MD 2014
4 4.1 MD 2015
\n", - "
" - ], - "text/plain": [ - " population state year\n", - "0 5.0 VA 2012\n", - "1 5.1 VA 2013\n", - "2 5.2 VA 2014\n", - "3 4.0 MD 2014\n", - "4 4.1 MD 2015" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data_1 = {'state' : ['VA', 'VA', 'VA', 'MD', 'MD'],\n", - " 'year' : [2012, 2013, 2014, 2014, 2015],\n", - " 'population' : [5.0, 5.1, 5.2, 4.0, 4.1]}\n", - "df_1 = DataFrame(data_1)\n", - "df_1" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Replace" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Replace all occurrences of a string with another string, in place (no copy):" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
populationstateyear
0 5.0 VIRGINIA 2012
1 5.1 VIRGINIA 2013
2 5.2 VIRGINIA 2014
3 4.0 MD 2014
4 4.1 MD 2015
\n", - "
" - ], - "text/plain": [ - " population state year\n", - "0 5.0 VIRGINIA 2012\n", - "1 5.1 VIRGINIA 2013\n", - "2 5.2 VIRGINIA 2014\n", - "3 4.0 MD 2014\n", - "4 4.1 MD 2015" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_1.replace('VA', 'VIRGINIA', inplace=True)\n", - "df_1" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In a specified column, replace all occurrences of a string with another string, in place (no copy):" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
populationstateyear
0 5.0 VIRGINIA 2012
1 5.1 VIRGINIA 2013
2 5.2 VIRGINIA 2014
3 4.0 MARYLAND 2014
4 4.1 MARYLAND 2015
\n", - "
" - ], - "text/plain": [ - " population state year\n", - "0 5.0 VIRGINIA 2012\n", - "1 5.1 VIRGINIA 2013\n", - "2 5.2 VIRGINIA 2014\n", - "3 4.0 MARYLAND 2014\n", - "4 4.1 MARYLAND 2015" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_1.replace({'state' : { 'MD' : 'MARYLAND' }}, inplace=True)\n", - "df_1" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Drop" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Drop the 'population' column and return a copy of the DataFrame:" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
stateyear
0 VIRGINIA 2012
1 VIRGINIA 2013
2 VIRGINIA 2014
3 MARYLAND 2014
4 MARYLAND 2015
\n", - "
" - ], - "text/plain": [ - " state year\n", - "0 VIRGINIA 2012\n", - "1 VIRGINIA 2013\n", - "2 VIRGINIA 2014\n", - "3 MARYLAND 2014\n", - "4 MARYLAND 2015" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_2 = df_1.drop('population', axis=1)\n", - "df_2" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Concatenate" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Concatenate two DataFrames:" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
populationstateyear
0 6.0 NY 2012
1 6.1 NY 2013
2 6.2 NY 2014
3 3.0 FL 2014
4 3.1 FL 2015
\n", - "
" - ], - "text/plain": [ - " population state year\n", - "0 6.0 NY 2012\n", - "1 6.1 NY 2013\n", - "2 6.2 NY 2014\n", - "3 3.0 FL 2014\n", - "4 3.1 FL 2015" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data_2 = {'state' : ['NY', 'NY', 'NY', 'FL', 'FL'],\n", - " 'year' : [2012, 2013, 2014, 2014, 2015],\n", - " 'population' : [6.0, 6.1, 6.2, 3.0, 3.1]}\n", - "df_3 = DataFrame(data_2)\n", - "df_3" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
populationstateyear
0 5.0 VIRGINIA 2012
1 5.1 VIRGINIA 2013
2 5.2 VIRGINIA 2014
3 4.0 MARYLAND 2014
4 4.1 MARYLAND 2015
0 6.0 NY 2012
1 6.1 NY 2013
2 6.2 NY 2014
3 3.0 FL 2014
4 3.1 FL 2015
\n", - "
" - ], - "text/plain": [ - " population state year\n", - "0 5.0 VIRGINIA 2012\n", - "1 5.1 VIRGINIA 2013\n", - "2 5.2 VIRGINIA 2014\n", - "3 4.0 MARYLAND 2014\n", - "4 4.1 MARYLAND 2015\n", - "0 6.0 NY 2012\n", - "1 6.1 NY 2013\n", - "2 6.2 NY 2014\n", - "3 3.0 FL 2014\n", - "4 3.1 FL 2015" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_4 = pd.concat([df_1, df_3])\n", - "df_4" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.9" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/pandas/pandas_io.ipynb b/pandas/pandas_io.ipynb deleted file mode 100644 index b9fa5cb..0000000 --- a/pandas/pandas_io.ipynb +++ /dev/null @@ -1,353 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Pandas I/O\n", - "* Reading\n", - "* Writing" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "from pandas import Series, DataFrame\n", - "import pandas as pd" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Reading" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Read data from a CSV file into a DataFrame (use sep='\\t' for TSV):" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "df_1 = pd.read_csv(\"../data/ozone.csv\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Get a summary of the DataFrame:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
OzoneSolar.RWindTempMonthDay
count 116.000000 146.000000 153.000000 153.000000 153.000000 153.000000
mean 42.129310 185.931507 9.957516 77.882353 6.993464 15.803922
std 32.987885 90.058422 3.523001 9.465270 1.416522 8.864520
min 1.000000 7.000000 1.700000 56.000000 5.000000 1.000000
25% 18.000000 115.750000 7.400000 72.000000 6.000000 8.000000
50% 31.500000 205.000000 9.700000 79.000000 7.000000 16.000000
75% 63.250000 258.750000 11.500000 85.000000 8.000000 23.000000
max 168.000000 334.000000 20.700000 97.000000 9.000000 31.000000
\n", - "
" - ], - "text/plain": [ - " Ozone Solar.R Wind Temp Month Day\n", - "count 116.000000 146.000000 153.000000 153.000000 153.000000 153.000000\n", - "mean 42.129310 185.931507 9.957516 77.882353 6.993464 15.803922\n", - "std 32.987885 90.058422 3.523001 9.465270 1.416522 8.864520\n", - "min 1.000000 7.000000 1.700000 56.000000 5.000000 1.000000\n", - "25% 18.000000 115.750000 7.400000 72.000000 6.000000 8.000000\n", - "50% 31.500000 205.000000 9.700000 79.000000 7.000000 16.000000\n", - "75% 63.250000 258.750000 11.500000 85.000000 8.000000 23.000000\n", - "max 168.000000 334.000000 20.700000 97.000000 9.000000 31.000000" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_1.describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "List the first five rows of the DataFrame:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
OzoneSolar.RWindTempMonthDay
0 41 190 7.4 67 5 1
1 36 118 8.0 72 5 2
2 12 149 12.6 74 5 3
3 18 313 11.5 62 5 4
4NaN NaN 14.3 56 5 5
\n", - "
" - ], - "text/plain": [ - " Ozone Solar.R Wind Temp Month Day\n", - "0 41 190 7.4 67 5 1\n", - "1 36 118 8.0 72 5 2\n", - "2 12 149 12.6 74 5 3\n", - "3 18 313 11.5 62 5 4\n", - "4 NaN NaN 14.3 56 5 5" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_1.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Writing" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create a copy of the CSV file, encoded in UTF-8 and hiding the index and header labels:" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "df_1.to_csv('../data/ozone_copy.csv', \n", - " encoding='utf-8', \n", - " index=False, \n", - " header=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "View the data directory:" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "total 16\r\n", - "-rw-r--r--@ 1 dmartin 1443163707 2902 Dec 26 2012 ozone.csv\r\n", - "-rw-r--r-- 1 dmartin 1443163707 3324 Feb 14 06:40 ozone_copy.csv\r\n" - ] - } - ], - "source": [ - "!ls -l ../data/" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.9" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -}