Updated notebook to v3.

This commit is contained in:
Donne Martin 2015-04-15 14:54:26 -04:00
parent 06fede5df4
commit 6109274ff0

View File

@ -1,12 +1,4 @@
{
"metadata": {
"name": "",
"signature": "sha256:71af98f7155af43ed0d04aeee32dfc65b677bd43e2000e52770d010c81b0095a"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
@ -40,13 +32,14 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"!ssh -i key.pem ubuntu@ipaddress"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -57,13 +50,14 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"!ssh -i key.pem ec2-user@ipaddress"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -85,13 +79,14 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"!sudo apt-get install s3cmd"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -104,13 +99,14 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"!s3cmd --configure"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -121,8 +117,12 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# List all buckets\n",
"!s3cmd ls\n",
"\n",
@ -167,10 +167,7 @@
"\n",
"# Apply a standard shell wildcard include to sync s3 bucket (source) to local (destination)\n",
"!s3cmd --include '2014-05-01*' sync s3://my-bucket-name/ my-local-folder-path/"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -190,14 +187,15 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"!sudo apt-get install boto\n",
"!sudo apt-get install git"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -208,13 +206,14 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"!git clone https://github.com/twpayne/s3-parallel-put.git"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -225,14 +224,15 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"!export AWS_ACCESS_KEY_ID=XXX\n",
"!export AWS_SECRET_ACCESS_KEY=XXX"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -243,13 +243,14 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"!s3-parallel-put --bucket=bucket --prefix=PREFIX SOURCE"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -260,13 +261,14 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"!s3-parallel-put --bucket=bucket --host=s3.amazonaws.com --put=stupid --dry-run --prefix=prefix/ ./"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -286,13 +288,14 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"!rvm --default ruby-1.8.7-p374"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -306,17 +309,18 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"!./elastic-mapreduce --create --instance-group master --instance-count 1 \\\n",
"--instance-type m1.small --instance-group core --instance-count 4 \\\n",
"--instance-type m1.small --jar /home/hadoop/lib/emr-s3distcp-1.0.jar \\\n",
"--args \"--src,s3://my-bucket-source/,--groupBy,.*([0-9]{4}-01).*,\\\n",
"--dest,s3://my-bucket-dest/,--targetSize,1024\""
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -328,20 +332,21 @@
"\n",
"* Time sensitive job: Snappy or LZO\n",
"* Large amounts of data: GZIP\n",
"* General purpose: GZIP, as it\u2019s supported by most platforms\n",
"* General purpose: GZIP, as it's supported by most platforms\n",
"\n",
"You can specify the compression codec (gzip, lzo, snappy, or none) to use for copied files with S3DistCp with \u2013outputCodec. If no value is specified, files are copied with no compression change. The code below sets the compression to lzo:"
"You can specify the compression codec (gzip, lzo, snappy, or none) to use for files copied by S3DistCp with the --outputCodec option. If no value is specified, files are copied with no compression change. The code below sets the compression to lzo:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"--outputCodec,lzo"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -366,13 +371,14 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"!python mr-script.py -r emr s3://bucket-source/ --output-dir=s3://bucket-dest/"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -383,13 +389,14 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"!python mrjob_script.py input_data.txt > output_data.txt"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -400,8 +407,12 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%%file mr_s3_log_parser.py\n",
"\n",
"import time\n",
@ -540,10 +551,7 @@
"\n",
"if __name__ == '__main__':\n",
" MrS3LogParser.run()"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -554,8 +562,12 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%%file test_mr_s3_log_parser.py\n",
"\n",
"from StringIO import StringIO\n",
@ -645,10 +657,7 @@
"if __name__ == '__main__':\n",
" unittest.main()\n",
"\n"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -659,13 +668,14 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"!python test_mr_s3_log_parser.py -v"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -683,15 +693,16 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"copy table_name from 's3://source/part'\n",
"credentials 'aws_access_key_id=XXX;aws_secret_access_key=XXX'\n",
"csv;"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -702,15 +713,16 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"copy table_name from 's3://source/part'\n",
"credentials 'aws_access_key_id=XXX;aws_secret_access_key=XXX'\n",
"csv delimiter '\\t';"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -721,13 +733,14 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"select * from stl_load_errors;"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -738,13 +751,14 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"VACUUM FULL;"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -755,13 +769,14 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"analyze compression table_name;"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -772,13 +787,14 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"cancel 18764;"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -791,13 +807,14 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"abort;"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -815,8 +832,12 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"CREATE TABLE part (\n",
" p_partkey integer not null sortkey distkey,\n",
" p_name varchar(22) not null,\n",
@ -889,10 +910,7 @@
" lo_commitdate integer not null,\n",
" lo_shipmode varchar(10) not null\n",
");"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -961,13 +979,14 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"!aws kinesis create-stream --stream-name Foo --shard-count 1 --profile adminuser"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -978,13 +997,14 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"!aws kinesis list-streams --profile adminuser"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -995,13 +1015,14 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"!aws kinesis describe-stream --stream-name Foo --profile adminuser"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -1012,13 +1033,14 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"!aws kinesis put-record --stream-name Foo --data \"SGVsbG8sIHRoaXMgaXMgYSB0ZXN0IDEyMy4=\" --partition-key shardId-000000000000 --region us-east-1 --profile adminuser"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -1029,14 +1051,15 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"!SHARD_ITERATOR=$(aws kinesis get-shard-iterator --shard-id shardId-000000000000 --shard-iterator-type TRIM_HORIZON --stream-name Foo --query 'ShardIterator' --profile adminuser)\n",
"aws kinesis get-records --shard-iterator $SHARD_ITERATOR"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -1047,13 +1070,14 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"!aws kinesis delete-stream --stream-name Foo --profile adminuser"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -1071,15 +1095,16 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"!aws lambda list-functions \\\n",
" --region us-east-1 \\\n",
" --max-items 10"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -1090,8 +1115,12 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"!aws lambda upload-function \\\n",
" --region us-east-1 \\\n",
" --function-name foo \\\n",
@ -1101,10 +1130,7 @@
" --handler foo.handler \\\n",
" --runtime nodejs \\\n",
" --debug"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -1115,17 +1141,18 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"!aws lambda invoke-async \\\n",
" --function-name foo \\\n",
" --region us-east-1 \\\n",
" --invoke-args foo.txt \\\n",
" --debug"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -1136,16 +1163,17 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"!aws lambda get-function-configuration \\\n",
" --function-name helloworld \\\n",
" --region us-east-1 \\\n",
" --debug"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -1156,16 +1184,17 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"!aws lambda get-function \\\n",
" --function-name helloworld \\\n",
" --region us-east-1 \\\n",
" --debug"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -1176,8 +1205,12 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"!aws lambda add-event-source \\\n",
" --region us-east-1 \\\n",
" --function-name ProcessKinesisRecords \\\n",
@ -1185,10 +1218,7 @@
" --event-source kinesis-stream-arn \\\n",
" --batch-size 100 \\\n",
" --profile adminuser"
],
"language": "python",
"metadata": {},
"outputs": []
]
},
{
"cell_type": "markdown",
@ -1199,19 +1229,38 @@
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"!aws lambda delete-function \\\n",
" --function-name helloworld \\\n",
" --region us-east-1 \\\n",
" --debug"
],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 0
}