diff --git a/README.md b/README.md
index 4bf8f8f..c91a770 100644
--- a/README.md
+++ b/README.md
@@ -125,8 +125,6 @@ IPython Notebook(s) demonstrating pandas functionality.
| Notebook | Description |
|--------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------|
| [pandas](http://nbviewer.ipython.org/github/donnemartin/data-science-ipython-notebooks/blob/master/pandas/pandas.ipynb) | Software library written for data manipulation and analysis in Python. Offers data structures and operations for manipulating numerical tables and time series. |
-| [pandas io](http://nbviewer.ipython.org/github/donnemartin/data-science-ipython-notebooks/blob/master/pandas/pandas_io.ipynb) | Input and output operations. |
-| [pandas cleaning](http://nbviewer.ipython.org/github/donnemartin/data-science-ipython-notebooks/blob/master/pandas/pandas_clean.ipynb) | Data wrangling operations. |
diff --git a/pandas/pandas.ipynb b/pandas/pandas.ipynb
index b0ff2b5..a758c35 100644
--- a/pandas/pandas.ipynb
+++ b/pandas/pandas.ipynb
@@ -15,7 +15,9 @@
"* Function Application and Mapping\n",
"* Sorting and Ranking\n",
"* Axis Indices with Duplicate Values\n",
- "* Summarizing and Computing Descriptive Statistics"
+ "* Summarizing and Computing Descriptive Statistics\n",
+ "* Cleaning Data (Under Construction)\n",
+ "* Input and Output (Under Construction)"
]
},
{
@@ -5749,6 +5751,891 @@
"source": [
"df_6.sum(axis=1, skipna=False)"
]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Cleaning Data (Under Construction)\n",
+ "* Replace\n",
+ "* Drop\n",
+ "* Concatenate"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "from pandas import Series, DataFrame\n",
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Set up a DataFrame:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " population | \n",
+ " state | \n",
+ " year | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 5.0 | \n",
+ " VA | \n",
+ " 2012 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 5.1 | \n",
+ " VA | \n",
+ " 2013 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 5.2 | \n",
+ " VA | \n",
+ " 2014 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4.0 | \n",
+ " MD | \n",
+ " 2014 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 4.1 | \n",
+ " MD | \n",
+ " 2015 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " population state year\n",
+ "0 5.0 VA 2012\n",
+ "1 5.1 VA 2013\n",
+ "2 5.2 VA 2014\n",
+ "3 4.0 MD 2014\n",
+ "4 4.1 MD 2015"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data_1 = {'state' : ['VA', 'VA', 'VA', 'MD', 'MD'],\n",
+ " 'year' : [2012, 2013, 2014, 2014, 2015],\n",
+ " 'population' : [5.0, 5.1, 5.2, 4.0, 4.1]}\n",
+ "df_1 = DataFrame(data_1)\n",
+ "df_1"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Replace"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Replace every value that exactly matches a string with another string, in place (no copy):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " population | \n",
+ " state | \n",
+ " year | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 5.0 | \n",
+ " VIRGINIA | \n",
+ " 2012 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 5.1 | \n",
+ " VIRGINIA | \n",
+ " 2013 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 5.2 | \n",
+ " VIRGINIA | \n",
+ " 2014 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4.0 | \n",
+ " MD | \n",
+ " 2014 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 4.1 | \n",
+ " MD | \n",
+ " 2015 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " population state year\n",
+ "0 5.0 VIRGINIA 2012\n",
+ "1 5.1 VIRGINIA 2013\n",
+ "2 5.2 VIRGINIA 2014\n",
+ "3 4.0 MD 2014\n",
+ "4 4.1 MD 2015"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_1.replace('VA', 'VIRGINIA', inplace=True)\n",
+ "df_1"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In a specified column, replace every value that exactly matches a string with another string, in place (no copy):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " population | \n",
+ " state | \n",
+ " year | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 5.0 | \n",
+ " VIRGINIA | \n",
+ " 2012 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 5.1 | \n",
+ " VIRGINIA | \n",
+ " 2013 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 5.2 | \n",
+ " VIRGINIA | \n",
+ " 2014 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4.0 | \n",
+ " MARYLAND | \n",
+ " 2014 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 4.1 | \n",
+ " MARYLAND | \n",
+ " 2015 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " population state year\n",
+ "0 5.0 VIRGINIA 2012\n",
+ "1 5.1 VIRGINIA 2013\n",
+ "2 5.2 VIRGINIA 2014\n",
+ "3 4.0 MARYLAND 2014\n",
+ "4 4.1 MARYLAND 2015"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_1.replace({'state' : { 'MD' : 'MARYLAND' }}, inplace=True)\n",
+ "df_1"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Drop"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Drop the 'population' column and return a copy of the DataFrame:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " state | \n",
+ " year | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " VIRGINIA | \n",
+ " 2012 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " VIRGINIA | \n",
+ " 2013 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " VIRGINIA | \n",
+ " 2014 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " MARYLAND | \n",
+ " 2014 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " MARYLAND | \n",
+ " 2015 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " state year\n",
+ "0 VIRGINIA 2012\n",
+ "1 VIRGINIA 2013\n",
+ "2 VIRGINIA 2014\n",
+ "3 MARYLAND 2014\n",
+ "4 MARYLAND 2015"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_2 = df_1.drop('population', axis=1)\n",
+ "df_2"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Concatenate"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Concatenate two DataFrames:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " population | \n",
+ " state | \n",
+ " year | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 6.0 | \n",
+ " NY | \n",
+ " 2012 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 6.1 | \n",
+ " NY | \n",
+ " 2013 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 6.2 | \n",
+ " NY | \n",
+ " 2014 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 3.0 | \n",
+ " FL | \n",
+ " 2014 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 3.1 | \n",
+ " FL | \n",
+ " 2015 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " population state year\n",
+ "0 6.0 NY 2012\n",
+ "1 6.1 NY 2013\n",
+ "2 6.2 NY 2014\n",
+ "3 3.0 FL 2014\n",
+ "4 3.1 FL 2015"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data_2 = {'state' : ['NY', 'NY', 'NY', 'FL', 'FL'],\n",
+ " 'year' : [2012, 2013, 2014, 2014, 2015],\n",
+ " 'population' : [6.0, 6.1, 6.2, 3.0, 3.1]}\n",
+ "df_3 = DataFrame(data_2)\n",
+ "df_3"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " population | \n",
+ " state | \n",
+ " year | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 5.0 | \n",
+ " VIRGINIA | \n",
+ " 2012 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 5.1 | \n",
+ " VIRGINIA | \n",
+ " 2013 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 5.2 | \n",
+ " VIRGINIA | \n",
+ " 2014 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4.0 | \n",
+ " MARYLAND | \n",
+ " 2014 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 4.1 | \n",
+ " MARYLAND | \n",
+ " 2015 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " 6.0 | \n",
+ " NY | \n",
+ " 2012 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 6.1 | \n",
+ " NY | \n",
+ " 2013 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 6.2 | \n",
+ " NY | \n",
+ " 2014 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 3.0 | \n",
+ " FL | \n",
+ " 2014 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 3.1 | \n",
+ " FL | \n",
+ " 2015 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " population state year\n",
+ "0 5.0 VIRGINIA 2012\n",
+ "1 5.1 VIRGINIA 2013\n",
+ "2 5.2 VIRGINIA 2014\n",
+ "3 4.0 MARYLAND 2014\n",
+ "4 4.1 MARYLAND 2015\n",
+ "0 6.0 NY 2012\n",
+ "1 6.1 NY 2013\n",
+ "2 6.2 NY 2014\n",
+ "3 3.0 FL 2014\n",
+ "4 3.1 FL 2015"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_4 = pd.concat([df_1, df_3])\n",
+ "df_4"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Input and Output (Under Construction)\n",
+ "* Reading\n",
+ "* Writing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "from pandas import Series, DataFrame\n",
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Reading"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Read data from a CSV file into a DataFrame (use sep='\\t' for TSV):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "df_1 = pd.read_csv(\"../data/ozone.csv\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Get a summary of the DataFrame:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Ozone | \n",
+ " Solar.R | \n",
+ " Wind | \n",
+ " Temp | \n",
+ " Month | \n",
+ " Day | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 116.000000 | \n",
+ " 146.000000 | \n",
+ " 153.000000 | \n",
+ " 153.000000 | \n",
+ " 153.000000 | \n",
+ " 153.000000 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 42.129310 | \n",
+ " 185.931507 | \n",
+ " 9.957516 | \n",
+ " 77.882353 | \n",
+ " 6.993464 | \n",
+ " 15.803922 | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 32.987885 | \n",
+ " 90.058422 | \n",
+ " 3.523001 | \n",
+ " 9.465270 | \n",
+ " 1.416522 | \n",
+ " 8.864520 | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 1.000000 | \n",
+ " 7.000000 | \n",
+ " 1.700000 | \n",
+ " 56.000000 | \n",
+ " 5.000000 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 18.000000 | \n",
+ " 115.750000 | \n",
+ " 7.400000 | \n",
+ " 72.000000 | \n",
+ " 6.000000 | \n",
+ " 8.000000 | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 31.500000 | \n",
+ " 205.000000 | \n",
+ " 9.700000 | \n",
+ " 79.000000 | \n",
+ " 7.000000 | \n",
+ " 16.000000 | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 63.250000 | \n",
+ " 258.750000 | \n",
+ " 11.500000 | \n",
+ " 85.000000 | \n",
+ " 8.000000 | \n",
+ " 23.000000 | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 168.000000 | \n",
+ " 334.000000 | \n",
+ " 20.700000 | \n",
+ " 97.000000 | \n",
+ " 9.000000 | \n",
+ " 31.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Ozone Solar.R Wind Temp Month Day\n",
+ "count 116.000000 146.000000 153.000000 153.000000 153.000000 153.000000\n",
+ "mean 42.129310 185.931507 9.957516 77.882353 6.993464 15.803922\n",
+ "std 32.987885 90.058422 3.523001 9.465270 1.416522 8.864520\n",
+ "min 1.000000 7.000000 1.700000 56.000000 5.000000 1.000000\n",
+ "25% 18.000000 115.750000 7.400000 72.000000 6.000000 8.000000\n",
+ "50% 31.500000 205.000000 9.700000 79.000000 7.000000 16.000000\n",
+ "75% 63.250000 258.750000 11.500000 85.000000 8.000000 23.000000\n",
+ "max 168.000000 334.000000 20.700000 97.000000 9.000000 31.000000"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_1.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "List the first five rows of the DataFrame:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Ozone | \n",
+ " Solar.R | \n",
+ " Wind | \n",
+ " Temp | \n",
+ " Month | \n",
+ " Day | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 41 | \n",
+ " 190 | \n",
+ " 7.4 | \n",
+ " 67 | \n",
+ " 5 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 36 | \n",
+ " 118 | \n",
+ " 8.0 | \n",
+ " 72 | \n",
+ " 5 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 12 | \n",
+ " 149 | \n",
+ " 12.6 | \n",
+ " 74 | \n",
+ " 5 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 18 | \n",
+ " 313 | \n",
+ " 11.5 | \n",
+ " 62 | \n",
+ " 5 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 14.3 | \n",
+ " 56 | \n",
+ " 5 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Ozone Solar.R Wind Temp Month Day\n",
+ "0 41 190 7.4 67 5 1\n",
+ "1 36 118 8.0 72 5 2\n",
+ "2 12 149 12.6 74 5 3\n",
+ "3 18 313 11.5 62 5 4\n",
+ "4 NaN NaN 14.3 56 5 5"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_1.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Writing"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Write the DataFrame to a new CSV file, encoded in UTF-8 and omitting the index and header labels:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "df_1.to_csv('../data/ozone_copy.csv', \n",
+ " encoding='utf-8', \n",
+ " index=False, \n",
+ " header=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "View the data directory:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "total 16\r\n",
+ "-rw-r--r--@ 1 dmartin 1443163707 2902 Dec 26 2012 ozone.csv\r\n",
+ "-rw-r--r-- 1 dmartin 1443163707 3324 Feb 14 06:40 ozone_copy.csv\r\n"
+ ]
+ }
+ ],
+ "source": [
+ "!ls -l ../data/"
+ ]
}
],
"metadata": {
@@ -5767,7 +6654,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
- "version": "2.7.9"
+ "version": "2.7.10"
}
},
"nbformat": 4,
diff --git a/pandas/pandas_clean.ipynb b/pandas/pandas_clean.ipynb
deleted file mode 100644
index fec0430..0000000
--- a/pandas/pandas_clean.ipynb
+++ /dev/null
@@ -1,591 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Pandas Cleaning\n",
- "* Replace\n",
- "* Drop\n",
- "* Concatenate"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "from pandas import Series, DataFrame\n",
- "import pandas as pd"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Setup a DataFrame:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " population | \n",
- " state | \n",
- " year | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 5.0 | \n",
- " VA | \n",
- " 2012 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 5.1 | \n",
- " VA | \n",
- " 2013 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 5.2 | \n",
- " VA | \n",
- " 2014 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 4.0 | \n",
- " MD | \n",
- " 2014 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 4.1 | \n",
- " MD | \n",
- " 2015 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " population state year\n",
- "0 5.0 VA 2012\n",
- "1 5.1 VA 2013\n",
- "2 5.2 VA 2014\n",
- "3 4.0 MD 2014\n",
- "4 4.1 MD 2015"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "data_1 = {'state' : ['VA', 'VA', 'VA', 'MD', 'MD'],\n",
- " 'year' : [2012, 2013, 2014, 2014, 2015],\n",
- " 'population' : [5.0, 5.1, 5.2, 4.0, 4.1]}\n",
- "df_1 = DataFrame(data_1)\n",
- "df_1"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Replace"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Replace all occurrences of a string with another string, in place (no copy):"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " population | \n",
- " state | \n",
- " year | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 5.0 | \n",
- " VIRGINIA | \n",
- " 2012 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 5.1 | \n",
- " VIRGINIA | \n",
- " 2013 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 5.2 | \n",
- " VIRGINIA | \n",
- " 2014 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 4.0 | \n",
- " MD | \n",
- " 2014 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 4.1 | \n",
- " MD | \n",
- " 2015 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " population state year\n",
- "0 5.0 VIRGINIA 2012\n",
- "1 5.1 VIRGINIA 2013\n",
- "2 5.2 VIRGINIA 2014\n",
- "3 4.0 MD 2014\n",
- "4 4.1 MD 2015"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df_1.replace('VA', 'VIRGINIA', inplace=True)\n",
- "df_1"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "In a specified column, replace all occurrences of a string with another string, in place (no copy):"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " population | \n",
- " state | \n",
- " year | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 5.0 | \n",
- " VIRGINIA | \n",
- " 2012 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 5.1 | \n",
- " VIRGINIA | \n",
- " 2013 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 5.2 | \n",
- " VIRGINIA | \n",
- " 2014 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 4.0 | \n",
- " MARYLAND | \n",
- " 2014 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 4.1 | \n",
- " MARYLAND | \n",
- " 2015 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " population state year\n",
- "0 5.0 VIRGINIA 2012\n",
- "1 5.1 VIRGINIA 2013\n",
- "2 5.2 VIRGINIA 2014\n",
- "3 4.0 MARYLAND 2014\n",
- "4 4.1 MARYLAND 2015"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df_1.replace({'state' : { 'MD' : 'MARYLAND' }}, inplace=True)\n",
- "df_1"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Drop"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Drop the 'population' column and return a copy of the DataFrame:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " state | \n",
- " year | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " VIRGINIA | \n",
- " 2012 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " VIRGINIA | \n",
- " 2013 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " VIRGINIA | \n",
- " 2014 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " MARYLAND | \n",
- " 2014 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " MARYLAND | \n",
- " 2015 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " state year\n",
- "0 VIRGINIA 2012\n",
- "1 VIRGINIA 2013\n",
- "2 VIRGINIA 2014\n",
- "3 MARYLAND 2014\n",
- "4 MARYLAND 2015"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df_2 = df_1.drop('population', axis=1)\n",
- "df_2"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Concatenate"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Concatenate two DataFrames:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " population | \n",
- " state | \n",
- " year | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 6.0 | \n",
- " NY | \n",
- " 2012 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 6.1 | \n",
- " NY | \n",
- " 2013 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 6.2 | \n",
- " NY | \n",
- " 2014 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 3.0 | \n",
- " FL | \n",
- " 2014 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 3.1 | \n",
- " FL | \n",
- " 2015 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " population state year\n",
- "0 6.0 NY 2012\n",
- "1 6.1 NY 2013\n",
- "2 6.2 NY 2014\n",
- "3 3.0 FL 2014\n",
- "4 3.1 FL 2015"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "data_2 = {'state' : ['NY', 'NY', 'NY', 'FL', 'FL'],\n",
- " 'year' : [2012, 2013, 2014, 2014, 2015],\n",
- " 'population' : [6.0, 6.1, 6.2, 3.0, 3.1]}\n",
- "df_3 = DataFrame(data_2)\n",
- "df_3"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " population | \n",
- " state | \n",
- " year | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 5.0 | \n",
- " VIRGINIA | \n",
- " 2012 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 5.1 | \n",
- " VIRGINIA | \n",
- " 2013 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 5.2 | \n",
- " VIRGINIA | \n",
- " 2014 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 4.0 | \n",
- " MARYLAND | \n",
- " 2014 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 4.1 | \n",
- " MARYLAND | \n",
- " 2015 | \n",
- "
\n",
- " \n",
- " 0 | \n",
- " 6.0 | \n",
- " NY | \n",
- " 2012 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 6.1 | \n",
- " NY | \n",
- " 2013 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 6.2 | \n",
- " NY | \n",
- " 2014 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 3.0 | \n",
- " FL | \n",
- " 2014 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 3.1 | \n",
- " FL | \n",
- " 2015 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " population state year\n",
- "0 5.0 VIRGINIA 2012\n",
- "1 5.1 VIRGINIA 2013\n",
- "2 5.2 VIRGINIA 2014\n",
- "3 4.0 MARYLAND 2014\n",
- "4 4.1 MARYLAND 2015\n",
- "0 6.0 NY 2012\n",
- "1 6.1 NY 2013\n",
- "2 6.2 NY 2014\n",
- "3 3.0 FL 2014\n",
- "4 3.1 FL 2015"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df_4 = pd.concat([df_1, df_3])\n",
- "df_4"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 2",
- "language": "python",
- "name": "python2"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 2
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython2",
- "version": "2.7.9"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
-}
diff --git a/pandas/pandas_io.ipynb b/pandas/pandas_io.ipynb
deleted file mode 100644
index b9fa5cb..0000000
--- a/pandas/pandas_io.ipynb
+++ /dev/null
@@ -1,353 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Pandas I/O\n",
- "* Reading\n",
- "* Writing"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "from pandas import Series, DataFrame\n",
- "import pandas as pd"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Reading"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Read data from a CSV file into a DataFrame (use sep='\\t' for TSV):"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "df_1 = pd.read_csv(\"../data/ozone.csv\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Get a summary of the DataFrame:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Ozone | \n",
- " Solar.R | \n",
- " Wind | \n",
- " Temp | \n",
- " Month | \n",
- " Day | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " count | \n",
- " 116.000000 | \n",
- " 146.000000 | \n",
- " 153.000000 | \n",
- " 153.000000 | \n",
- " 153.000000 | \n",
- " 153.000000 | \n",
- "
\n",
- " \n",
- " mean | \n",
- " 42.129310 | \n",
- " 185.931507 | \n",
- " 9.957516 | \n",
- " 77.882353 | \n",
- " 6.993464 | \n",
- " 15.803922 | \n",
- "
\n",
- " \n",
- " std | \n",
- " 32.987885 | \n",
- " 90.058422 | \n",
- " 3.523001 | \n",
- " 9.465270 | \n",
- " 1.416522 | \n",
- " 8.864520 | \n",
- "
\n",
- " \n",
- " min | \n",
- " 1.000000 | \n",
- " 7.000000 | \n",
- " 1.700000 | \n",
- " 56.000000 | \n",
- " 5.000000 | \n",
- " 1.000000 | \n",
- "
\n",
- " \n",
- " 25% | \n",
- " 18.000000 | \n",
- " 115.750000 | \n",
- " 7.400000 | \n",
- " 72.000000 | \n",
- " 6.000000 | \n",
- " 8.000000 | \n",
- "
\n",
- " \n",
- " 50% | \n",
- " 31.500000 | \n",
- " 205.000000 | \n",
- " 9.700000 | \n",
- " 79.000000 | \n",
- " 7.000000 | \n",
- " 16.000000 | \n",
- "
\n",
- " \n",
- " 75% | \n",
- " 63.250000 | \n",
- " 258.750000 | \n",
- " 11.500000 | \n",
- " 85.000000 | \n",
- " 8.000000 | \n",
- " 23.000000 | \n",
- "
\n",
- " \n",
- " max | \n",
- " 168.000000 | \n",
- " 334.000000 | \n",
- " 20.700000 | \n",
- " 97.000000 | \n",
- " 9.000000 | \n",
- " 31.000000 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Ozone Solar.R Wind Temp Month Day\n",
- "count 116.000000 146.000000 153.000000 153.000000 153.000000 153.000000\n",
- "mean 42.129310 185.931507 9.957516 77.882353 6.993464 15.803922\n",
- "std 32.987885 90.058422 3.523001 9.465270 1.416522 8.864520\n",
- "min 1.000000 7.000000 1.700000 56.000000 5.000000 1.000000\n",
- "25% 18.000000 115.750000 7.400000 72.000000 6.000000 8.000000\n",
- "50% 31.500000 205.000000 9.700000 79.000000 7.000000 16.000000\n",
- "75% 63.250000 258.750000 11.500000 85.000000 8.000000 23.000000\n",
- "max 168.000000 334.000000 20.700000 97.000000 9.000000 31.000000"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df_1.describe()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "List the first five rows of the DataFrame:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Ozone | \n",
- " Solar.R | \n",
- " Wind | \n",
- " Temp | \n",
- " Month | \n",
- " Day | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 41 | \n",
- " 190 | \n",
- " 7.4 | \n",
- " 67 | \n",
- " 5 | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 36 | \n",
- " 118 | \n",
- " 8.0 | \n",
- " 72 | \n",
- " 5 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 12 | \n",
- " 149 | \n",
- " 12.6 | \n",
- " 74 | \n",
- " 5 | \n",
- " 3 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 18 | \n",
- " 313 | \n",
- " 11.5 | \n",
- " 62 | \n",
- " 5 | \n",
- " 4 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " NaN | \n",
- " NaN | \n",
- " 14.3 | \n",
- " 56 | \n",
- " 5 | \n",
- " 5 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Ozone Solar.R Wind Temp Month Day\n",
- "0 41 190 7.4 67 5 1\n",
- "1 36 118 8.0 72 5 2\n",
- "2 12 149 12.6 74 5 3\n",
- "3 18 313 11.5 62 5 4\n",
- "4 NaN NaN 14.3 56 5 5"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df_1.head()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Writing"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Create a copy of the CSV file, encoded in UTF-8 and hiding the index and header labels:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "df_1.to_csv('../data/ozone_copy.csv', \n",
- " encoding='utf-8', \n",
- " index=False, \n",
- " header=False)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "View the data directory:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "total 16\r\n",
- "-rw-r--r--@ 1 dmartin 1443163707 2902 Dec 26 2012 ozone.csv\r\n",
- "-rw-r--r-- 1 dmartin 1443163707 3324 Feb 14 06:40 ozone_copy.csv\r\n"
- ]
- }
- ],
- "source": [
- "!ls -l ../data/"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 2",
- "language": "python",
- "name": "python2"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 2
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython2",
- "version": "2.7.9"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
-}