mirror of
https://github.com/donnemartin/data-science-ipython-notebooks.git
synced 2024-03-22 13:30:56 +08:00
Added snippets for basic RDD operations.
This commit is contained in:
parent
6c7e7b5239
commit
e8b481f480
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"metadata": {
|
||||
"name": "",
|
||||
"signature": "sha256:91816797cb89d9542e7443d05f797ce60e70a74be3a870823401ce7b3e7c3313"
|
||||
"signature": "sha256:6de81fb20a6c3dee884019beb8604ace32cb3be3e7d63d3547dd433075d62e69"
|
||||
},
|
||||
"nbformat": 3,
|
||||
"nbformat_minor": 0,
|
||||
|
@ -14,7 +14,8 @@
|
|||
"source": [
|
||||
"# Spark\n",
|
||||
"\n",
|
||||
"* Python Shell"
|
||||
"* Python Shell\n",
|
||||
"* RDDs"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -57,6 +58,169 @@
|
|||
"language": "python",
|
||||
"metadata": {},
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## RDDs"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Create an RDD from the contents of a directory:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"collapsed": false,
|
||||
"input": [
|
||||
"my_data = sc.textFile(\"file:/path/*\")"
|
||||
],
|
||||
"language": "python",
|
||||
"metadata": {},
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Count the number of lines in the data:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"collapsed": false,
|
||||
"input": [
|
||||
"my_data.count()"
|
||||
],
|
||||
"language": "python",
|
||||
"metadata": {},
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Display the data in the data:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"collapsed": false,
|
||||
"input": [
|
||||
"my_data.collect()"
|
||||
],
|
||||
"language": "python",
|
||||
"metadata": {},
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Return the first 10 lines in the data:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"collapsed": false,
|
||||
"input": [
|
||||
"my_data.take(10)"
|
||||
],
|
||||
"language": "python",
|
||||
"metadata": {},
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Create an RDD with lines matching the given filter:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"collapsed": false,
|
||||
"input": [
|
||||
"my_data.filter(lambda line: \".txt\" in line)"
|
||||
],
|
||||
"language": "python",
|
||||
"metadata": {},
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Chain a series of commands:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"collapsed": false,
|
||||
"input": [
|
||||
"sc.textFile(\"file:/path/file.txt\") \\\n",
|
||||
" .filter(lambda line: \".txt\" in line) \\\n",
|
||||
" .count()"
|
||||
],
|
||||
"language": "python",
|
||||
"metadata": {},
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Create a new RDD mapping each line to an array of words, taking only the first word of each array:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"collapsed": false,
|
||||
"input": [
|
||||
"first_words = my_data.map(lambda line: line.split()[0])"
|
||||
],
|
||||
"language": "python",
|
||||
"metadata": {},
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Output each word in first_words:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"collapsed": false,
|
||||
"input": [
|
||||
"for word in first_words.take(10):\n",
|
||||
" print word"
|
||||
],
|
||||
"language": "python",
|
||||
"metadata": {},
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Save the first words to a text file:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"collapsed": false,
|
||||
"input": [
|
||||
"first_words.saveAsTextFile(\"file:/path/file\")"
|
||||
],
|
||||
"language": "python",
|
||||
"metadata": {},
|
||||
"outputs": []
|
||||
}
|
||||
],
|
||||
"metadata": {}
|
||||
|
|
Loading…
Reference in New Issue
Block a user