2015-03-03 23:32:59 +08:00
|
|
|
{
|
|
|
|
"metadata": {
|
|
|
|
"name": "",
|
2015-03-03 23:36:30 +08:00
|
|
|
"signature": "sha256:6de81fb20a6c3dee884019beb8604ace32cb3be3e7d63d3547dd433075d62e69"
|
2015-03-03 23:32:59 +08:00
|
|
|
},
|
|
|
|
"nbformat": 3,
|
|
|
|
"nbformat_minor": 0,
|
|
|
|
"worksheets": [
|
|
|
|
{
|
|
|
|
"cells": [
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"# Spark\n",
|
|
|
|
"\n",
|
2015-03-03 23:36:30 +08:00
|
|
|
"* Python Shell\n",
|
|
|
|
"* RDDs"
|
2015-03-03 23:32:59 +08:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"## Python Shell"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"Start the pyspark shell:"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"collapsed": false,
|
|
|
|
"input": [
|
|
|
|
"pyspark"
|
|
|
|
],
|
|
|
|
"language": "python",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": []
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"View the Spark context:"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"collapsed": false,
|
|
|
|
"input": [
|
|
|
|
"sc"
|
|
|
|
],
|
|
|
|
"language": "python",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": []
|
2015-03-03 23:36:30 +08:00
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"## RDDs"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"Create an RDD from the contents of a directory:"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"collapsed": false,
|
|
|
|
"input": [
|
|
|
|
"my_data = sc.textFile(\"file:/path/*\")"
|
|
|
|
],
|
|
|
|
"language": "python",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": []
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"Count the number of lines in the data:"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"collapsed": false,
|
|
|
|
"input": [
|
|
|
|
"my_data.count()"
|
|
|
|
],
|
|
|
|
"language": "python",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": []
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"Display all of the elements in the data:"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"collapsed": false,
|
|
|
|
"input": [
|
|
|
|
"my_data.collect()"
|
|
|
|
],
|
|
|
|
"language": "python",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": []
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"Return the first 10 lines in the data:"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"collapsed": false,
|
|
|
|
"input": [
|
|
|
|
"my_data.take(10)"
|
|
|
|
],
|
|
|
|
"language": "python",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": []
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"Create an RDD with lines matching the given filter:"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"collapsed": false,
|
|
|
|
"input": [
|
|
|
|
"my_data.filter(lambda line: \".txt\" in line)"
|
|
|
|
],
|
|
|
|
"language": "python",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": []
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"Chain a series of commands:"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"collapsed": false,
|
|
|
|
"input": [
|
|
|
|
"sc.textFile(\"file:/path/file.txt\") \\\n",
|
|
|
|
" .filter(lambda line: \".txt\" in line) \\\n",
|
|
|
|
" .count()"
|
|
|
|
],
|
|
|
|
"language": "python",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": []
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"Create a new RDD mapping each line to an array of words, taking only the first word of each array:"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"collapsed": false,
|
|
|
|
"input": [
|
|
|
|
"first_words = my_data.map(lambda line: line.split()[0])"
|
|
|
|
],
|
|
|
|
"language": "python",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": []
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"Output the first 10 words in first_words:"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"collapsed": false,
|
|
|
|
"input": [
|
|
|
|
"for word in first_words.take(10):\n",
|
|
|
|
" print word"
|
|
|
|
],
|
|
|
|
"language": "python",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": []
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"Save the first words to a text file:"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"collapsed": false,
|
|
|
|
"input": [
|
|
|
|
"first_words.saveAsTextFile(\"file:/path/file\")"
|
|
|
|
],
|
|
|
|
"language": "python",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": []
|
2015-03-03 23:32:59 +08:00
|
|
|
}
|
|
|
|
],
|
|
|
|
"metadata": {}
|
|
|
|
}
|
|
|
|
]
|
|
|
|
}
|