data-science-ipython-notebooks/core/structs.ipynb
2015-01-26 07:32:04 -05:00

1329 lines
26 KiB
Plaintext

{
"metadata": {
"name": "",
"signature": "sha256:a19ef4e563c399be8f5e56aff348924149bc045dc0b070bd846325354013e9ed"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Structures"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"* tuple\n",
"* list\n",
"* dict\n",
"* set"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## tuple"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"One dimensional, fixed-length, immutable sequence"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"tup = (1, 2, 3)\n",
"tup"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 1,
"text": [
"(1, 2, 3)"
]
}
],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"list_1 = [1, 2, 3]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Convert to a tuple"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"type(tuple(list_1))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 3,
"text": [
"tuple"
]
}
],
"prompt_number": 3
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Nested tuples"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"nested_tup = ([1, 2, 3], (4, 5))\n",
"nested_tup"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 4,
"text": [
"([1, 2, 3], (4, 5))"
]
}
],
"prompt_number": 4
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Access by index O(1)"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"nested_tup[0]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 5,
"text": [
"[1, 2, 3]"
]
}
],
"prompt_number": 5
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Although tuples are immutable, their contents can contain mutable objects"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"nested_tup[0].append(4)\n",
"nested_tup[0]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 6,
"text": [
"[1, 2, 3, 4]"
]
}
],
"prompt_number": 6
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Concatenate tuples by creating a new tuple and copying objects"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"(1, 3, 2) + (4, 5, 6)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 7,
"text": [
"(1, 3, 2, 4, 5, 6)"
]
}
],
"prompt_number": 7
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Multiply copies references to objects (objects themselves are not copied)"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"('foo', 'bar') * 2"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 8,
"text": [
"('foo', 'bar', 'foo', 'bar')"
]
}
],
"prompt_number": 8
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Unpack tuples"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"a, b = nested_tup\n",
"a, b"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 9,
"text": [
"([1, 2, 3, 4], (4, 5))"
]
}
],
"prompt_number": 9
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Unpack nested tuples"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"(a, b, c, d), (e, f) = nested_tup\n",
"a, b, c, d, e, f"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 10,
"text": [
"(1, 2, 3, 4, 4, 5)"
]
}
],
"prompt_number": 10
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A common use of variable unpacking is when iterating over sequences of tuples or lists"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"seq = [( 1, 2, 3), (4, 5, 6), (7, 8, 9)] \n",
"for a, b, c in seq: \n",
" print(a, b, c)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"(1, 2, 3)\n",
"(4, 5, 6)\n",
"(7, 8, 9)\n"
]
}
],
"prompt_number": 11
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## list"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"One dimensional, variable-length, mutable sequence"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"list_1 = [1, 2, 3]\n",
"list_1"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 12,
"text": [
"[1, 2, 3]"
]
}
],
"prompt_number": 12
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Convert to a list"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"type(list(tup))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 13,
"text": [
"list"
]
}
],
"prompt_number": 13
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Nested list"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"nested_list = [(1, 2, 3), [4, 5]]\n",
"nested_list"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 14,
"text": [
"[(1, 2, 3), [4, 5]]"
]
}
],
"prompt_number": 14
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Access by index"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"nested_list[1]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 15,
"text": [
"[4, 5]"
]
}
],
"prompt_number": 15
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Append an element O(1)"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"nested_list.append(6)\n",
"nested_list"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 16,
"text": [
"[(1, 2, 3), [4, 5], 6]"
]
}
],
"prompt_number": 16
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Insert an element at a specific index. Insert is expensive as it has to shift subsequent elements O(n)."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"nested_list.insert(0, 'start')\n",
"nested_list"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 17,
"text": [
"['start', (1, 2, 3), [4, 5], 6]"
]
}
],
"prompt_number": 17
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Pop removes and returns an element from a specified index. Pop is expensive as it has to shift subsequent elements O(n). The operation is O(1) if pop is used for the last element."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"nested_list.pop(0)\n",
"nested_list"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 18,
"text": [
"[(1, 2, 3), [4, 5], 6]"
]
}
],
"prompt_number": 18
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Remove locates the first such value and removes it O(n)"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"nested_list.remove((1, 2, 3))\n",
"nested_list"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 19,
"text": [
"[[4, 5], 6]"
]
}
],
"prompt_number": 19
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Check if a list contains a value O(n)"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"6 in nested_list"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 20,
"text": [
"True"
]
}
],
"prompt_number": 20
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Concatenate lists by creating a new list and copying objects"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"[1, 3, 2] + [4, 5, 6]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 21,
"text": [
"[1, 3, 2, 4, 5, 6]"
]
}
],
"prompt_number": 21
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Extend a list by appending elements. Faster than concatenating lists."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"nested_list.extend([7, 8, 9])\n",
"nested_list"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 22,
"text": [
"[[4, 5], 6, 7, 8, 9]"
]
}
],
"prompt_number": 22
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## dict"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Also known as a hash map or associative array. A dict is a mutable collection of key-value pairs.\n",
"\n",
"Big O complexities are listed as average case, with most worst case complexities being O(n)."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"dict_1 = { 'a' : 'foo', 'b' : [0, 1, 2, 3] }\n",
"dict_1"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 23,
"text": [
"{'a': 'foo', 'b': [0, 1, 2, 3]}"
]
}
],
"prompt_number": 23
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Access by key O(1)"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"dict_1['b']"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 24,
"text": [
"[0, 1, 2, 3]"
]
}
],
"prompt_number": 24
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Insert or set by key O(1)"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"dict_1[5] = 'bar'\n",
"dict_1"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 25,
"text": [
"{5: 'bar', 'a': 'foo', 'b': [0, 1, 2, 3]}"
]
}
],
"prompt_number": 25
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Check if a dict contains a key O(1)"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"5 in dict_1"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 26,
"text": [
"True"
]
}
],
"prompt_number": 26
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Delete a key-value pair by key O(1)"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"dict_2 = dict(dict_1)\n",
"del dict_2[5]\n",
"dict_2"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 27,
"text": [
"{'a': 'foo', 'b': [0, 1, 2, 3]}"
]
}
],
"prompt_number": 27
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Pop a value O(1): deletes the key and returns its value"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"value = dict_2.pop('b')\n",
"print(value)\n",
"print(dict_2)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"[0, 1, 2, 3]\n",
"{'a': 'foo'}\n"
]
}
],
"prompt_number": 28
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Get or pop with a default value if the key is not found. By default, get() will return None and pop() will throw an exception if the key is not found."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"value = dict_1.get('z', 0)\n",
"value"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 29,
"text": [
"0"
]
}
],
"prompt_number": 29
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"setdefault() is similar to get(), but if the key is not found it also inserts the key with the default value before returning it"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print(dict_1.setdefault('b', None))\n",
"print(dict_1.setdefault('z', None))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"[0, 1, 2, 3]\n",
"None\n"
]
}
],
"prompt_number": 30
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In contrast to setdefault(), defaultdict lets you specify the default when the container is initialized, which works well if the default is appropriate for all keys."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from collections import defaultdict\n",
"\n",
"seq = ['foo', 'bar', 'baz']\n",
"first_letter = defaultdict(list)\n",
"for elem in seq:\n",
" first_letter[elem[0]].append(elem)\n",
"first_letter"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 31,
"text": [
"defaultdict(<type 'list'>, {'b': ['bar', 'baz'], 'f': ['foo']})"
]
}
],
"prompt_number": 31
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"dict keys must be \"hashable\": immutable objects like scalars (int, float, string) or tuples whose objects are all immutable."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print(hash('string'))\n",
"print(hash((1, 2, (3, 4))))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"-9167918882415130555\n",
"-2725224101759650258\n"
]
}
],
"prompt_number": 32
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Lists are mutable and therefore are not hashable, although you can convert the list portion to a tuple as a quick fix"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"hash((1, 2, [3, 4]))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "unhashable type: 'list'",
"output_type": "pyerr",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-33-94f25bbf31b2>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mhash\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m4\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m: unhashable type: 'list'"
]
}
],
"prompt_number": 33
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Get the list of keys in no particular order (although keys() outputs the keys in the same order as values() outputs the values). In Python 3, keys() returns a view object instead of a list."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"dict_1.keys()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 34,
"text": [
"['a', 'b', 5, 'z']"
]
}
],
"prompt_number": 34
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Get the list of values in no particular order (although values() outputs the values in the same order as keys() outputs the keys). In Python 3, values() returns a view object instead of a list."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"dict_1.values()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 35,
"text": [
"['foo', [0, 1, 2, 3], 'bar', None]"
]
}
],
"prompt_number": 35
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Merge one dict into another"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"dict_1.update({'e' : 'elephant', 'f' : 'fish'})\n",
"dict_1"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 36,
"text": [
"{5: 'bar',\n",
" 'a': 'foo',\n",
" 'b': [0, 1, 2, 3],\n",
" 'e': 'elephant',\n",
" 'f': 'fish',\n",
" 'z': None}"
]
}
],
"prompt_number": 36
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A common operation is to pair up two sequences element-wise in a dict"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"mapping = dict(zip(range(7), reversed(range(7))))\n",
"mapping"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 37,
"text": [
"{0: 6, 1: 5, 2: 4, 3: 3, 4: 2, 5: 1, 6: 0}"
]
}
],
"prompt_number": 37
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## set"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A set is an unordered collection of unique elements."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"set_1 = set([0, 1, 2, 3, 4, 5])\n",
"set_1"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 38,
"text": [
"{0, 1, 2, 3, 4, 5}"
]
}
],
"prompt_number": 38
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"set_2 = {1, 2, 3, 5, 8, 13}\n",
"set_2"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 39,
"text": [
"{1, 2, 3, 5, 8, 13}"
]
}
],
"prompt_number": 39
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Sets support set operations like union, intersection, difference, and symmetric difference"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Union O(len(set_1) + len(set_2))"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"set_1 | set_2"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 40,
"text": [
"{0, 1, 2, 3, 4, 5, 8, 13}"
]
}
],
"prompt_number": 40
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Intersection O(min(len(set_1), len(set_2)))"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"set_1 & set_2"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 41,
"text": [
"{1, 2, 3, 5}"
]
}
],
"prompt_number": 41
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Difference O(len(set_1))"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"set_1 - set_2"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 42,
"text": [
"{0, 4}"
]
}
],
"prompt_number": 42
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Symmetric Difference O(len(set_1))"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"set_1 ^ set_2"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 43,
"text": [
"{0, 4, 8, 13}"
]
}
],
"prompt_number": 43
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Subset"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"set_3 = {1, 2, 3}\n",
"set_3.issubset(set_2)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 44,
"text": [
"True"
]
}
],
"prompt_number": 44
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Superset"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"set_2.issuperset(set_3)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 45,
"text": [
"True"
]
}
],
"prompt_number": 45
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Equal"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"{1, 2, 3} == {3, 2, 1}"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 46,
"text": [
"True"
]
}
],
"prompt_number": 46
}
],
"metadata": {}
}
]
}