mirror of
https://github.com/donnemartin/data-science-ipython-notebooks.git
synced 2024-03-22 13:30:56 +08:00
325 lines
9.1 KiB
Plaintext
325 lines
9.1 KiB
Plaintext
{
|
|
"metadata": {
|
|
"name": "",
|
|
"signature": "sha256:da65d8daa07d931a0cc29a47c0b548fe12fdef8b547eb73d08edd31f44a3df38"
|
|
},
|
|
"nbformat": 3,
|
|
"nbformat_minor": 0,
|
|
"worksheets": [
|
|
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Pandas I/O\n",
|
|
"* Reading\n",
|
|
"* Writing"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"from pandas import Series, DataFrame\n",
|
|
"import pandas as pd"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"prompt_number": 1
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Read data from a CSV file into a DataFrame (pass `sep='\\t'` to read a tab-separated file instead):"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"df_1 = pd.read_csv(\"../data/ozone.csv\")"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"prompt_number": 2
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Get summary statistics (count, mean, standard deviation, min, quartiles, max) for each numeric column of the DataFrame:"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"df_1.describe()"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"html": [
|
|
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>Ozone</th>\n",
|
|
" <th>Solar.R</th>\n",
|
|
" <th>Wind</th>\n",
|
|
" <th>Temp</th>\n",
|
|
" <th>Month</th>\n",
|
|
" <th>Day</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>count</th>\n",
|
|
" <td> 116.000000</td>\n",
|
|
" <td> 146.000000</td>\n",
|
|
" <td> 153.000000</td>\n",
|
|
" <td> 153.000000</td>\n",
|
|
" <td> 153.000000</td>\n",
|
|
" <td> 153.000000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>mean</th>\n",
|
|
" <td> 42.129310</td>\n",
|
|
" <td> 185.931507</td>\n",
|
|
" <td> 9.957516</td>\n",
|
|
" <td> 77.882353</td>\n",
|
|
" <td> 6.993464</td>\n",
|
|
" <td> 15.803922</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>std</th>\n",
|
|
" <td> 32.987885</td>\n",
|
|
" <td> 90.058422</td>\n",
|
|
" <td> 3.523001</td>\n",
|
|
" <td> 9.465270</td>\n",
|
|
" <td> 1.416522</td>\n",
|
|
" <td> 8.864520</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>min</th>\n",
|
|
" <td> 1.000000</td>\n",
|
|
" <td> 7.000000</td>\n",
|
|
" <td> 1.700000</td>\n",
|
|
" <td> 56.000000</td>\n",
|
|
" <td> 5.000000</td>\n",
|
|
" <td> 1.000000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>25%</th>\n",
|
|
" <td> 18.000000</td>\n",
|
|
" <td> 115.750000</td>\n",
|
|
" <td> 7.400000</td>\n",
|
|
" <td> 72.000000</td>\n",
|
|
" <td> 6.000000</td>\n",
|
|
" <td> 8.000000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>50%</th>\n",
|
|
" <td> 31.500000</td>\n",
|
|
" <td> 205.000000</td>\n",
|
|
" <td> 9.700000</td>\n",
|
|
" <td> 79.000000</td>\n",
|
|
" <td> 7.000000</td>\n",
|
|
" <td> 16.000000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>75%</th>\n",
|
|
" <td> 63.250000</td>\n",
|
|
" <td> 258.750000</td>\n",
|
|
" <td> 11.500000</td>\n",
|
|
" <td> 85.000000</td>\n",
|
|
" <td> 8.000000</td>\n",
|
|
" <td> 23.000000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>max</th>\n",
|
|
" <td> 168.000000</td>\n",
|
|
" <td> 334.000000</td>\n",
|
|
" <td> 20.700000</td>\n",
|
|
" <td> 97.000000</td>\n",
|
|
" <td> 9.000000</td>\n",
|
|
" <td> 31.000000</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 3,
|
|
"text": [
|
|
" Ozone Solar.R Wind Temp Month Day\n",
|
|
"count 116.000000 146.000000 153.000000 153.000000 153.000000 153.000000\n",
|
|
"mean 42.129310 185.931507 9.957516 77.882353 6.993464 15.803922\n",
|
|
"std 32.987885 90.058422 3.523001 9.465270 1.416522 8.864520\n",
|
|
"min 1.000000 7.000000 1.700000 56.000000 5.000000 1.000000\n",
|
|
"25% 18.000000 115.750000 7.400000 72.000000 6.000000 8.000000\n",
|
|
"50% 31.500000 205.000000 9.700000 79.000000 7.000000 16.000000\n",
|
|
"75% 63.250000 258.750000 11.500000 85.000000 8.000000 23.000000\n",
|
|
"max 168.000000 334.000000 20.700000 97.000000 9.000000 31.000000"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 3
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"List the first five rows of the DataFrame:"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"df_1.head()"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"html": [
|
|
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>Ozone</th>\n",
|
|
" <th>Solar.R</th>\n",
|
|
" <th>Wind</th>\n",
|
|
" <th>Temp</th>\n",
|
|
" <th>Month</th>\n",
|
|
" <th>Day</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td> 41</td>\n",
|
|
" <td> 190</td>\n",
|
|
" <td> 7.4</td>\n",
|
|
" <td> 67</td>\n",
|
|
" <td> 5</td>\n",
|
|
" <td> 1</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td> 36</td>\n",
|
|
" <td> 118</td>\n",
|
|
" <td> 8.0</td>\n",
|
|
" <td> 72</td>\n",
|
|
" <td> 5</td>\n",
|
|
" <td> 2</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td> 12</td>\n",
|
|
" <td> 149</td>\n",
|
|
" <td> 12.6</td>\n",
|
|
" <td> 74</td>\n",
|
|
" <td> 5</td>\n",
|
|
" <td> 3</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td> 18</td>\n",
|
|
" <td> 313</td>\n",
|
|
" <td> 11.5</td>\n",
|
|
" <td> 62</td>\n",
|
|
" <td> 5</td>\n",
|
|
" <td> 4</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td> NaN</td>\n",
|
|
" <td> 14.3</td>\n",
|
|
" <td> 56</td>\n",
|
|
" <td> 5</td>\n",
|
|
" <td> 5</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 4,
|
|
"text": [
|
|
" Ozone Solar.R Wind Temp Month Day\n",
|
|
"0 41 190 7.4 67 5 1\n",
|
|
"1 36 118 8.0 72 5 2\n",
|
|
"2 12 149 12.6 74 5 3\n",
|
|
"3 18 313 11.5 62 5 4\n",
|
|
"4 NaN NaN 14.3 56 5 5"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 4
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Write the DataFrame to a new CSV file, encoded in UTF-8 and omitting the index and header labels:"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"df_1.to_csv('../data/ozone_copy.csv', \n",
|
|
" encoding='utf-8', \n",
|
|
" index=False, \n",
|
|
" header=False)"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"prompt_number": 5
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"List the contents of the data directory:"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"!ls -l ../data/"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"output_type": "stream",
|
|
"stream": "stdout",
|
|
"text": [
|
|
"total 16\r\n",
|
|
"-rw-r--r--@ 1 dmartin 1443163707 2902 Dec 26 2012 ozone.csv\r\n",
|
|
"-rw-r--r-- 1 dmartin 1443163707 3324 Feb 14 06:40 ozone_copy.csv\r\n"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 6
|
|
}
|
|
],
|
|
"metadata": {}
|
|
}
|
|
]
|
|
} |