AWS Glue for ETL (extract, transform and load) processes

(Image taken from: https://aws.amazon.com/glue/)

Create required resources

// resources.tf
// AWS provider configuration: every resource in this file is created in us-east-1.
provider "aws" {
  region = "us-east-1"
}

// IAM role that the Glue service principal (glue.amazonaws.com) is allowed to assume.
resource "aws_iam_role" "aws_iam_glue_role" {
  name = "AWSGlueServiceRoleDefault"

  // Trust policy built with jsonencode() instead of a hand-written heredoc, so the
  // document is always well-formed JSON regardless of whitespace or quoting slips.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Sid    = ""
        Effect = "Allow"
        Action = "sts:AssumeRole"
        Principal = {
          Service = "glue.amazonaws.com"
        }
      }
    ]
  })
}

// Attaches the AWS-managed AWSGlueServiceRole policy to the Glue role.
resource "aws_iam_role_policy_attachment" "glue_service_attachment" {
  role       = aws_iam_role.aws_iam_glue_role.id
  policy_arn = "arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole"
}

// Inline policy granting the Glue role access to the Glue bucket and the objects in it.
resource "aws_iam_role_policy" "s3_policy" {
  name = "s3_policy"
  role = aws_iam_role.aws_iam_glue_role.id

  // jsonencode() replaces the heredoc so the interpolated bucket name can never
  // produce malformed JSON.
  // NOTE(review): "s3:*" allows every S3 action on this bucket; consider narrowing
  // it to the actions the crawlers/jobs actually need.
  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect = "Allow"
        Action = [
          "s3:*",
        ]
        Resource = [
          "arn:aws:s3:::${var.bucket_for_glue}",   // the bucket itself
          "arn:aws:s3:::${var.bucket_for_glue}/*", // every object inside it
        ]
      }
    ]
  })
}

// Second inline policy on the Glue role; its document is taken verbatim from the
// "s3_policy" inline policy rather than being declared a second time.
resource "aws_iam_role_policy" "glue_service_s3" {
  policy = aws_iam_role_policy.s3_policy.policy
  role   = aws_iam_role.aws_iam_glue_role.id
  name   = "glue_service_s3"
}
// Name of the S3 bucket used by the Glue resources in this configuration.
// NOTE(review): S3 bucket names share a single global namespace, so override the
// default (e.g. -var "bucket_for_glue=...") if this name is already taken.
variable "bucket_for_glue" {
  description = "Bucket for AWS Glue..."
  type        = string
  default     = "aws-glue-etl-process"
}
// S3 bucket named after var.bucket_for_glue.
resource "aws_s3_bucket" "bucket_for_glue" {
  // presumably set so that `terraform destroy` can remove the bucket even while it
  // still contains uploaded objects — confirm against the provider docs
  force_destroy = true
  bucket        = var.bucket_for_glue
}
# Prepare the working directory, then create the resources declared in the Terraform files.
$ terraform init
$ terraform apply
# Upload the sample log file; replace {bucket} with the name of the bucket to use.
$ aws s3 cp ./nginx_logs.txt s3://{bucket}/data/raw/nginx_logs.txt

Using Custom Classifiers

"Ipad mini",EU,50.0,2
"Lenovo Ideapad",AU,350.0,1
"Huawei Y9 2019",UE,120.0,2
"MSI",LATAM,500.0,6
"Samsung 27-VA",CA,50.0,3
# Upload the CSV sample under the sales/ prefix; replace {bucket} with the name of the bucket to use.
$ aws s3 cp ./custom_data.csv s3://{bucket}/sales/custom_data.csv
// Glue Data Catalog database named "sales".
resource "aws_glue_catalog_database" "sales_database" {
  name = "sales"
}

// Custom CSV classifier: comma-delimited, double-quoted values, no header row in
// the data, with the column names supplied explicitly.
resource "aws_glue_classifier" "aws_glue_csv_classifier" {
  name = "csv-classifier"

  csv_classifier {
    delimiter    = ","
    quote_symbol = "\""

    // The files carry no header line, so the column names are listed here instead.
    contains_header = "ABSENT"
    header          = ["PRODUCT_NAME", "COUNTRY", "PRICE", "QUANTITY"]
  }
}

// Crawler that scans the sales/ prefix of the Glue bucket using the custom CSV
// classifier and targets the "sales" catalog database.
resource "aws_glue_crawler" "aws_glue_custom_csv_crawler" {
  name = "custom-csv-crawler"
  role = aws_iam_role.aws_iam_glue_role.arn

  database_name = aws_glue_catalog_database.sales_database.name
  classifiers   = [aws_glue_classifier.aws_glue_csv_classifier.id]

  s3_target {
    path = "s3://${aws_s3_bucket.bucket_for_glue.bucket}/sales/"
  }
}
# Remove the resources managed by this Terraform configuration when finished.
$ terraform destroy

--

--

--

Cloud & Solutions & Data Architect | Python Developer | Serverless Advocate

Love podcasts or audiobooks? Learn on the go with our new app.

Recommended from Medium

Ruby methods to make your life simpler

Top 5 Ways To Purchase A Made use of System.

Microservices Architecture

A Beginners Guide to Docker

Agile Testing: from Feature to E2E

A quick guide to publishing Python Package on PyPI

Using Pandas to implement Stop Losses and Take Profits for trading strategy backtests

How I managed to encounter and recover from Computer Science’s Two Hardest Problems

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on Google Play', and if clicked it will lead you to the Google Play store
Alejandro Cora González

Alejandro Cora González

Cloud & Solutions & Data Architect | Python Developer | Serverless Advocate

More from Medium

Resiliency and Chaos Engineering — Part 3

Cloud Computing Vs On-Premises

Cost

Regulating for Data Protection: GDPR

About privacy and Data Brokers