mirror of
https://github.com/donnemartin/data-science-ipython-notebooks.git
synced 2024-03-22 13:30:56 +08:00
359 lines
9.1 KiB
Plaintext
359 lines
9.1 KiB
Plaintext
{
|
|
"metadata": {
|
|
"name": "",
|
|
"signature": "sha256:d9028f29469f49a645af74dd1ee28614caee8ca7ca47089925560fa874a2384b"
|
|
},
|
|
"nbformat": 3,
|
|
"nbformat_minor": 0,
|
|
"worksheets": [
|
|
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Functions"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"* Functions as Objects\n",
|
|
"* Lambda Functions\n",
|
|
"* Closures\n",
|
|
"* \\*args, \\*\\*kwargs\n",
|
|
"* Currying\n",
|
|
"* Generators\n",
|
|
"* Generator Expressions"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Functions as Objects"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Python treats functions as objects, which can simplify data cleaning. The following cell defines a transform utility class with two class methods for cleaning strings:"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"%%file transform_util.py\n",
|
|
"import re\n",
|
|
"\n",
|
|
"\n",
|
|
"class TransformUtil:\n",
|
|
"\n",
|
|
" @classmethod\n",
|
|
" def remove_punctuation(cls, value):\n",
|
|
" \"\"\"Removes !, #, and ?.\n",
|
|
" \"\"\" \n",
|
|
" return re.sub('[!#?]', '', value) \n",
|
|
"\n",
|
|
" @classmethod\n",
|
|
" def clean_strings(cls, strings, ops): \n",
|
|
" \"\"\"General purpose method to clean strings.\n",
|
|
"\n",
|
|
" Pass in a sequence of strings and the operations to perform.\n",
|
|
" \"\"\" \n",
|
|
" result = [] \n",
|
|
" for value in strings: \n",
|
|
" for function in ops: \n",
|
|
" value = function(value) \n",
|
|
" result.append(value) \n",
|
|
" return result"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"output_type": "stream",
|
|
"stream": "stdout",
|
|
"text": [
|
|
"Overwriting transform_util.py\n"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 1
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Below are nose tests that exercise the utility functions:"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"%%file tests/test_transform_util.py\n",
|
|
"from nose.tools import assert_equal\n",
|
|
"from ..transform_util import TransformUtil\n",
|
|
"\n",
|
|
"\n",
|
|
"class TestTransformUtil():\n",
|
|
"\n",
|
|
" states = [' Alabama ', 'Georgia!', 'Georgia', 'georgia', \\\n",
|
|
" 'FlOrIda', 'south carolina##', 'West virginia?']\n",
|
|
" \n",
|
|
" expected_output = ['Alabama',\n",
|
|
" 'Georgia',\n",
|
|
" 'Georgia',\n",
|
|
" 'Georgia',\n",
|
|
" 'Florida',\n",
|
|
" 'South Carolina',\n",
|
|
" 'West Virginia']\n",
|
|
" \n",
|
|
" def test_remove_punctuation(self):\n",
|
|
" assert_equal(TransformUtil.remove_punctuation('!#?'), '')\n",
|
|
" \n",
|
|
" def test_map_remove_punctuation(self):\n",
|
|
" # Map applies a function to a collection\n",
|
|
" output = map(TransformUtil.remove_punctuation, self.states)\n",
|
|
" assert_equal('!#?' not in output, True)\n",
|
|
"\n",
|
|
" def test_clean_strings(self):\n",
|
|
" clean_ops = [str.strip, TransformUtil.remove_punctuation, str.title] \n",
|
|
" output = TransformUtil.clean_strings(self.states, clean_ops)\n",
|
|
" assert_equal(output, self.expected_output)\n"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"output_type": "stream",
|
|
"stream": "stdout",
|
|
"text": [
|
|
"Overwriting tests/test_transform_util.py\n"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 2
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Execute the nose tests in verbose mode:"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"!nosetests tests/test_transform_util.py -v"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"output_type": "stream",
|
|
"stream": "stdout",
|
|
"text": [
|
|
"core.tests.test_transform_util.TestTransformUtil.test_clean_strings ... ok\r\n",
|
|
"core.tests.test_transform_util.TestTransformUtil.test_map_remove_punctuation ... ok\r\n",
|
|
"core.tests.test_transform_util.TestTransformUtil.test_remove_punctuation ... ok\r\n",
|
|
"\r\n",
|
|
"----------------------------------------------------------------------\r\n",
|
|
"Ran 3 tests in 0.001s\r\n",
|
|
"\r\n",
|
|
"OK\r\n"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 3
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Lambda Functions"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Lambda functions are anonymous functions. They are convenient for data analysis because many data transformation functions take functions as arguments."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Sort a list of strings in place by the number of characters in each string:"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"strings = ['foo', 'bar,', 'baz', 'f', 'fo', 'b', 'ba']\n",
|
|
"strings.sort(key=lambda x: len(list(x)))\n",
|
|
"strings"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 4,
|
|
"text": [
|
|
"['f', 'b', 'fo', 'ba', 'foo', 'baz', 'bar,']"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 4
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Closures"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Closures are dynamically generated functions returned by another function. The returned function has access to the variables in the local namespace where it was created."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Each time the following closure() is called, it prints the same output, because it captured the value of x that was passed to make_closure():"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"def make_closure(x):\n",
|
|
" def closure():\n",
|
|
" print('Secret value is: %s' % x)\n",
|
|
" return closure\n",
|
|
"\n",
|
|
"closure = make_closure(7)\n",
|
|
"closure()"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"output_type": "stream",
|
|
"stream": "stdout",
|
|
"text": [
|
|
"Secret value is: 7\n"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 5
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Keep track of arguments passed:"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"def make_watcher():\n",
|
|
" dict_seen = {}\n",
|
|
" \n",
|
|
" def watcher(x):\n",
|
|
" if x in dict_seen:\n",
|
|
" return True\n",
|
|
" else:\n",
|
|
" dict_seen[x] = True\n",
|
|
" return False\n",
|
|
" \n",
|
|
" return watcher\n",
|
|
"\n",
|
|
"watcher = make_watcher()\n",
|
|
"seq = [1, 1, 2, 3, 5, 8, 13, 2, 5, 13]\n",
|
|
"[watcher(x) for x in seq]"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"metadata": {},
|
|
"output_type": "pyout",
|
|
"prompt_number": 6,
|
|
"text": [
|
|
"[False, True, False, False, False, False, False, True, True, True]"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 6
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## \\*args, \\*\\*kwargs"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"\\*args and \\*\\*kwargs are useful when you don't know how many arguments might be passed to your function or when you want to handle named arguments that you have not defined in advance."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Print arguments and call the input function on \\*args:"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"collapsed": false,
|
|
"input": [
|
|
"def foo(func, arg, *args, **kwargs):\n",
|
|
" print('arg: %s', arg)\n",
|
|
" print('args: %s', args)\n",
|
|
" print('kwargs: %s', kwargs)\n",
|
|
" \n",
|
|
" print('func result: %s', func(args))\n",
|
|
"\n",
|
|
"foo(sum, \"foo\", 1, 2, 3, 4, 5)"
|
|
],
|
|
"language": "python",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"output_type": "stream",
|
|
"stream": "stdout",
|
|
"text": [
|
|
"('arg: %s', 'foo')\n",
|
|
"('args: %s', (1, 2, 3, 4, 5))\n",
|
|
"('kwargs: %s', {})\n",
|
|
"('func result: %s', 15)\n"
|
|
]
|
|
}
|
|
],
|
|
"prompt_number": 7
|
|
}
|
|
],
|
|
"metadata": {}
|
|
}
|
|
]
|
|
} |