mirror of
https://github.com/donnemartin/data-science-ipython-notebooks.git
synced 2024-03-22 13:30:56 +08:00
577 lines
11 KiB
Plaintext
577 lines
11 KiB
Plaintext
|
{
|
||
|
"metadata": {
|
||
|
"name": "",
|
||
|
"signature": "sha256:5af6c8db3042b9d07306a075e560855c3bd9a73234feb466482830d025b58068"
|
||
|
},
|
||
|
"nbformat": 3,
|
||
|
"nbformat_minor": 0,
|
||
|
"worksheets": [
|
||
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"# Pandas"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"## Basics"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"collapsed": false,
|
||
|
"input": [
|
||
|
"from pandas import Series, DataFrame\n",
|
||
|
"import pandas as pd"
|
||
|
],
|
||
|
"language": "python",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"prompt_number": 1
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"## Series\n",
|
||
|
"\n",
|
||
|
"A Series is a one-dimensional array-like object containing an array of data and an associated array of data labels. The data can be any NumPy data type and the labels are the Series' index."
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"Create a Series:"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"collapsed": false,
|
||
|
"input": [
|
||
|
"ser_1 = Series([1, 1, 2, -3, -5, 8, 13])\n",
|
||
|
"ser_1"
|
||
|
],
|
||
|
"language": "python",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"output_type": "pyout",
|
||
|
"prompt_number": 2,
|
||
|
"text": [
|
||
|
"0 1\n",
|
||
|
"1 1\n",
|
||
|
"2 2\n",
|
||
|
"3 -3\n",
|
||
|
"4 -5\n",
|
||
|
"5 8\n",
|
||
|
"6 13\n",
|
||
|
"dtype: int64"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"prompt_number": 2
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"Get the array representation of a Series:"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"collapsed": false,
|
||
|
"input": [
|
||
|
"ser_1.values"
|
||
|
],
|
||
|
"language": "python",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"output_type": "pyout",
|
||
|
"prompt_number": 3,
|
||
|
"text": [
|
||
|
"array([ 1, 1, 2, -3, -5, 8, 13])"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"prompt_number": 3
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"Get the index of the Series:"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"collapsed": false,
|
||
|
"input": [
|
||
|
"ser_1.index"
|
||
|
],
|
||
|
"language": "python",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"output_type": "pyout",
|
||
|
"prompt_number": 4,
|
||
|
"text": [
|
||
|
"Int64Index([0, 1, 2, 3, 4, 5, 6], dtype='int64')"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"prompt_number": 4
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"Create a Series with a custom index:"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"collapsed": false,
|
||
|
"input": [
|
||
|
"ser_2 = Series([1, 1, 2, -3, -5], index=['a', 'b', 'c', 'd', 'e'])\n",
|
||
|
"ser_2"
|
||
|
],
|
||
|
"language": "python",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"output_type": "pyout",
|
||
|
"prompt_number": 5,
|
||
|
"text": [
|
||
|
"a 1\n",
|
||
|
"b 1\n",
|
||
|
"c 2\n",
|
||
|
"d -3\n",
|
||
|
"e -5\n",
|
||
|
"dtype: int64"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"prompt_number": 5
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"Get a value from a Series:"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"collapsed": false,
|
||
|
"input": [
|
||
|
"ser_2[4] == ser_2['e']"
|
||
|
],
|
||
|
"language": "python",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"output_type": "pyout",
|
||
|
"prompt_number": 6,
|
||
|
"text": [
|
||
|
"True"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"prompt_number": 6
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"Get a set of values from a Series by passing in a list:"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"collapsed": false,
|
||
|
"input": [
|
||
|
"ser_2[['c', 'a', 'b']]"
|
||
|
],
|
||
|
"language": "python",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"output_type": "pyout",
|
||
|
"prompt_number": 7,
|
||
|
"text": [
|
||
|
"c 2\n",
|
||
|
"a 1\n",
|
||
|
"b 1\n",
|
||
|
"dtype: int64"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"prompt_number": 7
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"Get values greater than 0:"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"collapsed": false,
|
||
|
"input": [
|
||
|
"ser_2[ser_2 > 0]"
|
||
|
],
|
||
|
"language": "python",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"output_type": "pyout",
|
||
|
"prompt_number": 8,
|
||
|
"text": [
|
||
|
"a 1\n",
|
||
|
"b 1\n",
|
||
|
"c 2\n",
|
||
|
"dtype: int64"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"prompt_number": 8
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"Scalar multiply:"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"collapsed": false,
|
||
|
"input": [
|
||
|
"ser_2 * 2"
|
||
|
],
|
||
|
"language": "python",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"output_type": "pyout",
|
||
|
"prompt_number": 9,
|
||
|
"text": [
|
||
|
"a 2\n",
|
||
|
"b 2\n",
|
||
|
"c 4\n",
|
||
|
"d -6\n",
|
||
|
"e -10\n",
|
||
|
"dtype: int64"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"prompt_number": 9
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"Apply a NumPy math function:"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"collapsed": false,
|
||
|
"input": [
|
||
|
"import numpy as np\n",
|
||
|
"np.exp(ser_2)"
|
||
|
],
|
||
|
"language": "python",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"output_type": "pyout",
|
||
|
"prompt_number": 10,
|
||
|
"text": [
|
||
|
"a 2.718282\n",
|
||
|
"b 2.718282\n",
|
||
|
"c 7.389056\n",
|
||
|
"d 0.049787\n",
|
||
|
"e 0.006738\n",
|
||
|
"dtype: float64"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"prompt_number": 10
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"A Series is like a fixed-length, ordered dict. \n",
|
||
|
"\n",
|
||
|
"Create a Series by passing in a dict:"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"collapsed": false,
|
||
|
"input": [
|
||
|
"dict_1 = {'foo' : 100, 'bar' : 200, 'baz' : 300}\n",
|
||
|
"ser_3 = Series(dict_1)\n",
|
||
|
"ser_3"
|
||
|
],
|
||
|
"language": "python",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"output_type": "pyout",
|
||
|
"prompt_number": 11,
|
||
|
"text": [
|
||
|
"bar 200\n",
|
||
|
"baz 300\n",
|
||
|
"foo 100\n",
|
||
|
"dtype: int64"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"prompt_number": 11
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"Create a Series from a dict with an explicit index to control the order (index labels not found in the dict get NaN):"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"collapsed": false,
|
||
|
"input": [
|
||
|
"index = ['foo', 'bar', 'baz', 'qux']\n",
|
||
|
"ser_4 = Series(dict_1, index=index)\n",
|
||
|
"ser_4"
|
||
|
],
|
||
|
"language": "python",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"output_type": "pyout",
|
||
|
"prompt_number": 12,
|
||
|
"text": [
|
||
|
"foo 100\n",
|
||
|
"bar 200\n",
|
||
|
"baz 300\n",
|
||
|
"qux NaN\n",
|
||
|
"dtype: float64"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"prompt_number": 12
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"Check for NaN with the pandas method:"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"collapsed": false,
|
||
|
"input": [
|
||
|
"pd.isnull(ser_4)"
|
||
|
],
|
||
|
"language": "python",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"output_type": "pyout",
|
||
|
"prompt_number": 13,
|
||
|
"text": [
|
||
|
"foo False\n",
|
||
|
"bar False\n",
|
||
|
"baz False\n",
|
||
|
"qux True\n",
|
||
|
"dtype: bool"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"prompt_number": 13
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"Check for NaN with the Series method:"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"collapsed": false,
|
||
|
"input": [
|
||
|
"ser_4.isnull()"
|
||
|
],
|
||
|
"language": "python",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"output_type": "pyout",
|
||
|
"prompt_number": 14,
|
||
|
"text": [
|
||
|
"foo False\n",
|
||
|
"bar False\n",
|
||
|
"baz False\n",
|
||
|
"qux True\n",
|
||
|
"dtype: bool"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"prompt_number": 14
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"Series automatically aligns differently indexed data in arithmetic operations:"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"collapsed": false,
|
||
|
"input": [
|
||
|
"ser_3 + ser_4"
|
||
|
],
|
||
|
"language": "python",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"output_type": "pyout",
|
||
|
"prompt_number": 15,
|
||
|
"text": [
|
||
|
"bar 400\n",
|
||
|
"baz 600\n",
|
||
|
"foo 200\n",
|
||
|
"qux NaN\n",
|
||
|
"dtype: float64"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"prompt_number": 15
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"Name a Series:"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"collapsed": false,
|
||
|
"input": [
|
||
|
"ser_4.name = 'foobarbazqux'"
|
||
|
],
|
||
|
"language": "python",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"prompt_number": 16
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"Name a Series index:"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"collapsed": false,
|
||
|
"input": [
|
||
|
"ser_4.index.name = 'label'"
|
||
|
],
|
||
|
"language": "python",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"prompt_number": 17
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"collapsed": false,
|
||
|
"input": [
|
||
|
"ser_4"
|
||
|
],
|
||
|
"language": "python",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"output_type": "pyout",
|
||
|
"prompt_number": 18,
|
||
|
"text": [
|
||
|
"label\n",
|
||
|
"foo 100\n",
|
||
|
"bar 200\n",
|
||
|
"baz 300\n",
|
||
|
"qux NaN\n",
|
||
|
"Name: foobarbazqux, dtype: float64"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"prompt_number": 18
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"Rename a Series' index in place:"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"collapsed": false,
|
||
|
"input": [
|
||
|
"ser_4.index = ['fo', 'br', 'bz', 'qx']\n",
|
||
|
"ser_4"
|
||
|
],
|
||
|
"language": "python",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"output_type": "pyout",
|
||
|
"prompt_number": 19,
|
||
|
"text": [
|
||
|
"fo 100\n",
|
||
|
"br 200\n",
|
||
|
"bz 300\n",
|
||
|
"qx NaN\n",
|
||
|
"Name: foobarbazqux, dtype: float64"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"prompt_number": 19
|
||
|
}
|
||
|
],
|
||
|
"metadata": {}
|
||
|
}
|
||
|
]
|
||
|
}
|