406. AWS Glue Crawler API Calls
import boto3
import time
import uuid
from datetime import datetime

# Use a random suffix so repeated runs do not collide with an existing crawler.
crawlerName = "TestCrawler-" + str(uuid.uuid1())
crawlerRole = "arn:aws:iam::xxxxxxxxxxxx:role/Basic_Glue_Role"
databaseName = "test"
s3DataPath = "s3://bucket-name/data/"
crawlerTarget = {
    'S3Targets': [
        {
            'Path': s3DataPath
        }
    ]
}

client = boto3.client('glue')
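# boto3 picks up credentials and region from your environment or AWS profile.
# To be explicit about the region (an example, not required), you could use:
#     client = boto3.client('glue', region_name='us-east-1')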
#
# 1. Create a crawler.
#
response = client.create_crawler(
    Name=crawlerName,
    Role=crawlerRole,
    DatabaseName=databaseName,
    Targets=crawlerTarget,
    TablePrefix=crawlerName    # prefix new tables so they are easy to find
)
status = response['ResponseMetadata']['HTTPStatusCode']
if status == 200:
    print('The create_crawler API call returned 200 OK')
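# If you reuse a fixed crawler name instead of the random suffix above,
# create_crawler raises AlreadyExistsException. A minimal sketch of
# catching it (commented out, since this script always uses a fresh name):
#
# try:
#     client.create_crawler(Name=crawlerName, Role=crawlerRole,
#                           DatabaseName=databaseName, Targets=crawlerTarget)
# except client.exceptions.AlreadyExistsException:
#     print('Crawler ' + crawlerName + ' already exists, reusing it')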
#
# 2. Check the crawler state and make sure it is ready.
# boto3 has no built-in waiter for Glue crawlers, so we poll get_crawler.
#
state = 'TEST'
while state != 'READY':
    response = client.get_crawler(Name=crawlerName)
    state = response['Crawler']['State']
    print(state)
    time.sleep(1)    # poll once per second to avoid throttling the Glue API
#
# 3. Start the crawler and wait for it to finish running.
# Record the start time so that new/updated tables can be identified later.
# boto3 returns timezone-aware timestamps, so naive datetimes are compared
# on both sides below.
#
start_time = datetime.now()
client.start_crawler(Name=crawlerName)
state = 'TEST'
while state != 'READY':
    response = client.get_crawler(Name=crawlerName)
    state = response['Crawler']['State']
    print(state)
    if state != 'READY':
        time.sleep(10)    # a crawl takes minutes, so poll every 10 seconds
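# The crawler returning to READY does not by itself mean the crawl
# succeeded. A sketch of checking the outcome of the run, assuming at
# least one crawl has completed so that LastCrawl is populated:
last_crawl = response['Crawler'].get('LastCrawl', {})
print('Last crawl status: ' + last_crawl.get('Status', 'UNKNOWN'))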
#
# 4. Get crawler metrics.
#
response = client.get_crawler_metrics(CrawlerNameList=[crawlerName])
metrics = response['CrawlerMetricsList']
print(metrics)
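# Each entry in CrawlerMetricsList summarizes one crawler. A sketch of
# reading a few useful fields from the first entry:
if metrics:
    m = metrics[0]
    print('Tables created: ' + str(m['TablesCreated']))
    print('Tables updated: ' + str(m['TablesUpdated']))
    print('Tables deleted: ' + str(m['TablesDeleted']))
    print('Last runtime (s): ' + str(m['LastRuntimeSeconds']))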
#
# 5. Check for new and updated tables in the database.
#
response = client.get_tables(DatabaseName=databaseName)
tables = response['TableList']
for table in tables:
    tableName = table['Name']
    create_time = table['CreateTime'].replace(tzinfo=None)
    update_time = table['UpdateTime'].replace(tzinfo=None)
    if start_time < create_time:
        print('Created Table: ' + tableName)
    elif start_time < update_time:
        print('Updated Table: ' + tableName)
    else:
        continue    # not touched by this crawl, leave it alone
    #
    # 5.1 Delete the test tables created/updated by this crawl.
    #
    client.delete_table(DatabaseName=databaseName, Name=tableName)
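# get_tables returns at most one page of results. For databases with many
# tables, boto3's built-in GetTables paginator covers the full list; a
# sketch (commented out, since the test tables were just deleted):
#
# paginator = client.get_paginator('get_tables')
# for page in paginator.paginate(DatabaseName=databaseName):
#     for table in page['TableList']:
#         print(table['Name'])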
#
# 6. Delete the crawler.
#
response = client.delete_crawler(Name=crawlerName)
status = response['ResponseMetadata']['HTTPStatusCode']
if status == 200:
    print('The delete_crawler API call returned 200 OK')
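# delete_crawler fails with CrawlerRunningException while the crawler is
# running. A sketch of guarding against that, in case you delete before
# the run has finished (commented out, the crawler is READY here):
#
# try:
#     client.delete_crawler(Name=crawlerName)
# except client.exceptions.CrawlerRunningException:
#     print('Crawler still running; stop it or wait before deleting')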
#
# 7. Double-check whether the crawler is still there.
# get_crawler on a deleted crawler raises EntityNotFoundException,
# so handle that error in your code.
#
try:
    response = client.get_crawler(Name=crawlerName)
    print(response)
except client.exceptions.EntityNotFoundException as e:
    print('As expected, the crawler is gone: ' + str(e))