mirror of
https://github.com/donnemartin/data-science-ipython-notebooks.git
synced 2024-03-22 13:30:56 +08:00
819 lines
16 KiB
Plaintext
819 lines
16 KiB
Plaintext
{
|
|
"metadata": {
|
|
"name": "",
|
|
"signature": "sha256:f24f6112f8e8e28d262a4f84be84d1ae459154a9a445687d55392ff177e6a03c"
|
|
},
|
|
"nbformat": 3,
|
|
"nbformat_minor": 0,
|
|
"worksheets": [
|
|
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Data Structures"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## tuple"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"# One dimensional, fixed-length, immutable sequence\n",
|
|
"tup = (1, 2, 3)\n",
|
|
"tup"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 1,
|
|
"text": [
|
|
"(1, 2, 3)"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 1
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"a_list = [1, 2, 3]"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"prompt_number": 2
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"# Convert to a tuple\n",
|
|
"type(tuple(a_list))"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 3,
|
|
"text": [
|
|
"tuple"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 3
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"# Nested tuples\n",
|
|
"nested_tup = ([1, 2, 3], (4, 5))\n",
|
|
"nested_tup"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 4,
|
|
"text": [
|
|
"([1, 2, 3], (4, 5))"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 4
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"# Access by index O(1)\n",
|
|
"nested_tup[0]"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 5,
|
|
"text": [
|
|
"[1, 2, 3]"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 5
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"# Although tuples are immutable, their contents can contain mutable objects\n",
|
|
"nested_tup[0].append(4)\n",
|
|
"nested_tup[0]"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 6,
|
|
"text": [
|
|
"[1, 2, 3, 4]"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 6
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"# Concatenate tuples\n",
|
|
"# Creates a new tuple and copies objects\n",
|
|
"(1, 3, 2) + (4, 5, 6)"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 7,
|
|
"text": [
|
|
"(1, 3, 2, 4, 5, 6)"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 7
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"# Multiply copies references to objects (objects themselves are not copied)\n",
|
|
"('foo', 'bar') * 2"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 8,
|
|
"text": [
|
|
"('foo', 'bar', 'foo', 'bar')"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 8
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"# Unpack tuples\n",
|
|
"a, b = nested_tup\n",
|
|
"a, b"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 9,
|
|
"text": [
|
|
"([1, 2, 3, 4], (4, 5))"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 9
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"# Unpack nested tuples\n",
|
|
"(a, b, c, d), (e, f) = nested_tup\n",
|
|
"a, b, c, d, e, f"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 10,
|
|
"text": [
|
|
"(1, 2, 3, 4, 4, 5)"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 10
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"# A common use of variable unpacking is when iterating over sequences\n",
|
|
"# of tuples or lists\n",
|
|
"seq = [( 1, 2, 3), (4, 5, 6), (7, 8, 9)] \n",
|
|
"for a, b, c in seq: \n",
|
|
" print(a, b, c)"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"output_type": "stream",
|
|
"stream": "stdout",
|
|
"text": [
|
|
"(1, 2, 3)\n",
|
|
"(4, 5, 6)\n",
|
|
"(7, 8, 9)\n"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 11
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## list"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"# One dimensional, variable-length, mutable sequence\n",
|
|
"a_list = [1, 2, 3]\n",
|
|
"a_list"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 12,
|
|
"text": [
|
|
"[1, 2, 3]"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 12
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"# Convert to a list\n",
|
|
"type(list(tup))"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 13,
|
|
"text": [
|
|
"list"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 13
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"# Nested list\n",
|
|
"nested_list = [(1, 2, 3), [4, 5]]\n",
|
|
"nested_list"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 14,
|
|
"text": [
|
|
"[(1, 2, 3), [4, 5]]"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 14
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"# Access by index\n",
|
|
"nested_list[1]"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 15,
|
|
"text": [
|
|
"[4, 5]"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 15
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"# Append an element O(1)\n",
|
|
"nested_list.append(6)\n",
|
|
"nested_list"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 16,
|
|
"text": [
|
|
"[(1, 2, 3), [4, 5], 6]"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 16
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"# Insert an element at a specific index\n",
|
|
"# Insert is expensive as it has to shift subsequent elements O(n)\n",
|
|
"nested_list.insert(0, 'start')\n",
|
|
"nested_list"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 17,
|
|
"text": [
|
|
"['start', (1, 2, 3), [4, 5], 6]"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 17
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"# Pop removes and returns an element from a specified index\n",
|
|
"# Pop is expensive as it has to shift subsequent elements O(n)\n",
|
|
"# O(1) if pop is used for the last element\n",
|
|
"nested_list.pop(0)\n",
|
|
"nested_list"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 18,
|
|
"text": [
|
|
"[(1, 2, 3), [4, 5], 6]"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 18
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"# Remove locates the first such value and removes it O(n)\n",
|
|
"nested_list.remove((1, 2, 3))\n",
|
|
"nested_list"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 19,
|
|
"text": [
|
|
"[[4, 5], 6]"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 19
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"# Check if a list contains a value O(n)\n",
|
|
"6 in nested_list"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 20,
|
|
"text": [
|
|
"True"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 20
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"# Concatenate lists\n",
|
|
"# Creates a new list and copies objects\n",
|
|
"[1, 3, 2] + [4, 5, 6]"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 21,
|
|
"text": [
|
|
"[1, 3, 2, 4, 5, 6]"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 21
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"# Extend a list by appending elements\n",
|
|
"# Faster than concatenating lists\n",
|
|
"nested_list.extend([7, 8, 9])\n",
|
|
"nested_list"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 22,
|
|
"text": [
|
|
"[[4, 5], 6, 7, 8, 9]"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 22
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## sort"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"# Sort in-place O(n log n)\n",
|
|
"a_list = [1, 5, 3, 9, 7, 6]\n",
|
|
"a_list.sort()\n",
|
|
"a_list"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 23,
|
|
"text": [
|
|
"[1, 3, 5, 6, 7, 9]"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 23
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"# Sort by secondary key: str length\n",
|
|
"b_list = ['the', 'quick', 'brown', 'fox', 'jumps', 'over']\n",
|
|
"b_list.sort(key=len)\n",
|
|
"b_list"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 24,
|
|
"text": [
|
|
"['the', 'fox', 'over', 'quick', 'brown', 'jumps']"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 24
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## bisect"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"# The bisect module does not check whether the list is sorted, as this check\n",
|
|
"# would be expensive O(n). Using bisect on an unsorted list will not result\n",
|
|
"# in an error but could lead to incorrect results.\n",
|
|
"import bisect\n",
|
|
"\n",
|
|
"# Find the location where an element should be inserted to keep the\n",
|
|
"# list sorted\n",
|
|
"c_list = [1, 2, 2, 3, 5, 13]\n",
|
|
"bisect.bisect(c_list, 8)"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 25,
|
|
"text": [
|
|
"5"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 25
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"# Inserts an element into a location to keep the list sorted\n",
|
|
"bisect.insort(c_list, 8)\n",
|
|
"c_list"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 26,
|
|
"text": [
|
|
"[1, 2, 2, 3, 5, 8, 13]"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 26
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## slice"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"![alt text](http://www.nltk.org/images/string-slicing.png)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"# Select a section of list types (arrays, tuples, NumPy arrays)\n",
|
|
"# start:stop\n",
|
|
"# start is included, stop is not\n",
|
|
"# number of elements in the result is stop - start\n",
|
|
"d_list = 'Monty Python'\n",
|
|
"d_list[6:10]"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 27,
|
|
"text": [
|
|
"'Pyth'"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 27
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"# Omit start to default to start of the sequence\n",
|
|
"d_list[:5]"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 28,
|
|
"text": [
|
|
"'Monty'"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 28
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"# Omit end to default to end of the sequence\n",
|
|
"d_list[6:]"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 29,
|
|
"text": [
|
|
"'Python'"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 29
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"# Negative indices slice relative to the end\n",
|
|
"d_list[-12:-7]"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 30,
|
|
"text": [
|
|
"'Monty'"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 30
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"# Slice can also take a step such as the one below, which takes\n",
|
|
"# every other element\n",
|
|
"e_list[::2]"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 33,
|
|
"text": [
|
|
"[1, 2, 5, 13]"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 33
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"# Passing -1 for the step reverses the list or tuple:\n",
|
|
"e_list[::-1]"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 34,
|
|
"text": [
|
|
"[13, ['H', 'a', 'l', 'l'], 5, 3, 2, 1, 1]"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 34
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"# Assign elements to a slice\n",
|
|
"# Slice range does not have to equal number of elements to assign\n",
|
|
"e_list = [1, 1, 2, 3, 5, 8, 13]\n",
|
|
"e_list[5:] = ['H', 'a', 'l', 'l']\n",
|
|
"e_list"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 31,
|
|
"text": [
|
|
"[1, 1, 2, 3, 5, 'H', 'a', 'l', 'l']"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 31
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"# Compare assigning into a slice (above) versus assigning into\n",
|
|
"# an inde\n",
|
|
"e_list"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 32,
|
|
"text": [
|
|
"[1, 1, 2, 3, 5, ['H', 'a', 'l', 'l'], 13]"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 32
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": []
|
|
}
|
|
],
|
|
"metadata": {}
|
|
}
|
|
]
|
|
} |