406. AWS Glue Crawler API Calls
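
This example exercises the AWS Glue Crawler APIs with boto3 end to end: create a crawler, wait for it to become READY, run it against an S3 path, read the crawler metrics, inspect the tables it produced, and clean everything up.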

import boto3
import uuid
import time
from datetime import datetime, timezone

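# The crawler name embeds a UUID so repeated test runs do not clash. The
# role ARN and the S3 path below are placeholders to replace with your own.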
crawlerName   = "TestCrawler-" + str(uuid.uuid1())
crawlerRole   = "arn:aws:iam::xxxxxxxxxxxx:role/Bacis_Glue_Role"
databaseName  = "test"
s3DataPath    = "s3://bucket-name/data/"
crawlerTarget = {
        'S3Targets': [
            {
                'Path': s3DataPath
            }
        ]
}
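
#
# Note: a crawler is not limited to S3 targets. The Targets structure can
# also carry JDBC and DynamoDB targets. A minimal sketch (the connection
# name and table name are hypothetical placeholders):
#
# crawlerTarget = {
#         'JdbcTargets': [
#             {
#                 'ConnectionName': 'my-jdbc-connection',
#                 'Path': 'my_database/%'
#             }
#         ],
#         'DynamoDBTargets': [
#             {
#                 'Path': 'my-dynamodb-table'
#             }
#         ]}
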
client = boto3.client('glue')

# 
# 1. Create a crawler.
#
response = client.create_crawler(
        Name = crawlerName,
        Role = crawlerRole,
        DatabaseName = databaseName,
        Targets = crawlerTarget,
        TablePrefix = crawlerName
)
status = response['ResponseMetadata']['HTTPStatusCode']
if (status == 200):
        print('The create_crawler API call returned 200 OK')
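
#
# Note: create_crawler raises an AlreadyExistsException when a crawler with
# the same name already exists. The UUID suffix makes a clash unlikely in
# this test, but production code would typically guard the call:
#
# try:
#         client.create_crawler(...)
# except client.exceptions.AlreadyExistsException:
#         pass    # reuse or delete the existing crawler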

#
# 2. Check the crawler state and wait until it is READY.
#
state = 'TEST'
while (state != 'READY'):
        response = client.get_crawler(Name = crawlerName)
        state = response['Crawler']['State']
        print(state)
        time.sleep(0.05)
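
# A crawler reports one of three states: READY, RUNNING, and STOPPING. At
# the time of writing, boto3 ships no waiter for Glue crawlers, so a manual
# polling loop like the one above is the usual approach.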

#
# 3. Start the crawler and wait for it to finish running.
#
# Record the start time as naive UTC, matching the Glue timestamps below.
start_time = datetime.now(timezone.utc).replace(tzinfo=None)
client.start_crawler(Name = crawlerName)
state = 'TEST'
while (state != 'READY'):
        response = client.get_crawler(Name = crawlerName)
        state = response['Crawler']['State']
        print(state)
        if (state != 'READY'):
                time.sleep(10)
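
#
# The outcome of the finished run is recorded in the crawler's LastCrawl
# attribute; its Status field is SUCCEEDED, CANCELLED, or FAILED.
#
last_crawl = response['Crawler'].get('LastCrawl', {})
print('Last crawl status: ' + str(last_crawl.get('Status')))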

#
# 4. Get crawler metrics
#
response = client.get_crawler_metrics(CrawlerNameList = [crawlerName])
metrics = response['CrawlerMetricsList']
print(metrics)
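
#
# Each entry in CrawlerMetricsList also carries individual fields such as
# TablesCreated, TablesUpdated, TablesDeleted, and LastRuntimeSeconds,
# which are handier than dumping the whole structure:
#
for m in metrics:
        print(m['CrawlerName'] + ': ' +
              str(m['TablesCreated']) + ' created, ' +
              str(m['TablesUpdated']) + ' updated, ' +
              str(m['TablesDeleted']) + ' deleted')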

#
# 5. Check new tables in the database
#
response = client.get_tables(DatabaseName = databaseName)
tables = response['TableList']
for table in tables:
        tableName = table['Name']
        create_time = table['CreateTime'].replace(tzinfo=None)
        update_time = table['UpdateTime'].replace(tzinfo=None)
        if (start_time < create_time):
                print('Created Table: ' + tableName)
        elif (start_time < update_time):
                print('Updated Table: ' + tableName)
        #
        # 5.1 Delete the test tables. The TablePrefix set at creation time
        # makes the tables from this crawler easy to identify.
        #
        if (tableName.startswith(crawlerName)):
                client.delete_table(DatabaseName = databaseName, Name = tableName)
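
#
# With many test tables, a single batch_delete_table call is an alternative
# to deleting them one by one:
#
# client.batch_delete_table(
#         DatabaseName = databaseName,
#         TablesToDelete = [t['Name'] for t in tables
#                           if t['Name'].startswith(crawlerName)]
# )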

#
# 6. Delete the crawler.
#
response = client.delete_crawler(Name = crawlerName)
status = response['ResponseMetadata']['HTTPStatusCode']
if (status == 200):
        print('The delete_crawler API call returned 200 OK')
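
# Note: delete_crawler fails with a CrawlerRunningException if the crawler
# is still running, so only delete it after the state is back to READY.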
#
# 7. Double-check that the crawler is gone. get_crawler now raises an
# EntityNotFoundException, which the code has to handle.
#
try:
        response = client.get_crawler(Name = crawlerName)
        print(response)
except client.exceptions.EntityNotFoundException:
        print('The crawler ' + crawlerName + ' no longer exists')