304. TPC H Tests - qyjohn/AWS_Tutorials GitHub Wiki

(1) Download and Install

Visit the TPC website to download the TPC-H benchmark tool. You will get an email message with instructions to download a zip file.

Launch an EC2 instance with Amazon Linux. Install gcc and make:

sudo yum install gcc make

Unzip the zip file, then go into this particular folder dbgen. Create a Makefile from the template.

cp makefile.suite makefile

Edit the above-mentioned makefile, with the following changes:

CC = gcc
DATABASE = ORACLE
MACHINE = LINUX
WORKLOAD = TPCH

Compile the utility:

make

(2) Data Generation

In the above-mentioned dbgen folder, run the following command to generate test data. Here 1 means generating 1 GB of test data. If you want to generate 1 TB data, use 1024 instead of 1. (Of course you need to have sufficient storage space on the EC2 instance. We recommend that you do this on an i2.8xlarge instance with 8x800 GB instance-store volumes. Create a RAID0 device out of the 8 instance-store volumes to do the work.)

./dbgen -vf -s 1

After a while, you should have the following data files in the same folder:

2018-08-29 22:39:11   24346144 customer.tbl
2018-08-29 22:39:19  759863287 lineitem.tbl
2018-08-29 22:39:32       2224 nation.tbl
2018-08-29 22:39:39  171952161 orders.tbl
2018-08-29 22:40:21   24135125 part.tbl
2018-08-29 22:40:02  118984616 partsupp.tbl
2018-08-29 22:40:31        389 region.tbl
2018-08-29 22:40:38    1409184 supplier.tbl

Use the following script (upload.sh) to upload these files to your S3 bucket:

#!/bin/bash
aws s3 cp $1.tbl s3://<your-bucket-name>/$1/$1.tbl

For example:

chmod +x upload.sh
./upload.sh customer
./upload.sh lineitem

(3) Table Creation

In Athena, use the following queries to create the necessary tables:

CREATE EXTERNAL TABLE PART 
(P_PARTKEY INT, P_NAME VARCHAR(55), P_MFGR VARCHAR(25), P_BRAND VARCHAR(10), P_TYPE VARCHAR(25), P_SIZE INT, P_CONTAINER VARCHAR(10), P_RETAILPRICE DECIMAL, P_COMMENT VARCHAR(23)) 
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' 
LOCATION 's3://<your-bucket-name>/part/';

CREATE EXTERNAL TABLE SUPPLIER 
(S_SUPPKEY INT, S_NAME CHAR(25), S_ADDRESS VARCHAR(40), S_NATIONKEY BIGINT, S_PHONE CHAR(15), S_ACCTBAL DECIMAL, S_COMMENT VARCHAR(101)) 
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' 
LOCATION 's3://<your-bucket-name>/supplier/';

CREATE EXTERNAL TABLE PARTSUPP 
(PS_PARTKEY BIGINT, PS_SUPPKEY BIGINT, PS_AVAILQTY INT, PS_SUPPLYCOST DECIMAL, PS_COMMENT VARCHAR(199))
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
LOCATION 's3://<your-bucket-name>/partsupp/';

CREATE EXTERNAL TABLE CUSTOMER 
(C_CUSTKEY INT, C_NAME VARCHAR(25), C_ADDRESS VARCHAR(40), C_NATIONKEY BIGINT, C_PHONE CHAR(15), C_ACCTBAL DECIMAL, C_MKTSEGMENT CHAR(10), C_COMMENT VARCHAR(117))
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
LOCATION 's3://<your-bucket-name>/customer/';

CREATE EXTERNAL TABLE ORDERS
(O_ORDERKEY INT, O_CUSTKEY BIGINT, O_ORDERSTATUS CHAR(1), O_TOTALPRICE DECIMAL, O_ORDERDATE DATE, O_ORDERPRIORITY CHAR(15), O_CLERK CHAR(15), O_SHIPPRIORITY INT, O_COMMENT VARCHAR(79))
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
LOCATION 's3://<your-bucket-name>/orders/';

CREATE EXTERNAL TABLE LINEITEM 
(L_ORDERKEY BIGINT, L_PARTKEY BIGINT, L_SUPPKEY BIGINT, L_LINENUMBER INT, L_QUANTITY DECIMAL, L_EXTENDEDPRICE DECIMAL, L_DISCOUNT DECIMAL, L_TAX DECIMAL, L_RETURNFLAG CHAR(1), L_LINESTATUS CHAR(1), L_SHIPDATE DATE, L_COMMITDATE DATE, L_RECEIPTDATE DATE, L_SHIPINSTRUCT CHAR(25), L_SHIPMODE CHAR(10), L_COMMENT	 VARCHAR(44))
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
LOCATION 's3://<your-bucket-name>/lineitem/';

CREATE EXTERNAL TABLE NATION
(N_NATIONKEY INT, N_NAME CHAR(25), N_REGIONKEY BIGINT, N_COMMENT VARCHAR(152))
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
LOCATION 's3://<your-bucket-name>/nation/';

CREATE EXTERNAL TABLE REGION
(R_REGIONKEY INT, R_NAME CHAR(25), R_COMMENT VARCHAR(152))
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
LOCATION 's3://<your-bucket-name>/region/';

(4) Test Queries

The following test queries have been tested (not fully tested) and they seemed to work in Athena and Presto.

Query 1

select
l_returnflag,
l_linestatus,
sum(l_quantity) as sum_qty,
sum(l_extendedprice) as sum_base_price,
sum(l_extendedprice*(1-l_discount)) as sum_disc_price,
sum(l_extendedprice*(1-l_discount)*(1+l_tax)) as sum_charge,
avg(l_quantity) as avg_qty,
avg(l_extendedprice) as avg_price,
avg(l_discount) as avg_disc,
count(*) as count_order
from
lineitem
where
l_shipdate <= date '1998-12-01' - interval '90' day
group by
l_returnflag,
l_linestatus
order by
l_returnflag,
l_linestatus;

Query 2:

select
s_acctbal,
s_name,
n_name,
p_partkey,
p_mfgr,
s_address,
s_phone,
s_comment
from
part,
supplier,
partsupp,
nation,
region
where
p_partkey = ps_partkey
and s_suppkey = ps_suppkey
and p_size = 15
and p_type like '%BRASS'
and s_nationkey = n_nationkey
and n_regionkey = r_regionkey
and r_name = 'EUROPE'
and ps_supplycost = (
select
min(ps_supplycost)
from
partsupp, supplier,
nation, region
where
p_partkey = ps_partkey
and s_suppkey = ps_suppkey
and s_nationkey = n_nationkey
and n_regionkey = r_regionkey
and r_name = 'EUROPE'
)
order by
s_acctbal desc,
n_name,
s_name,
p_partkey;

Query 3:

SELECT  L_ORDERKEY, SUM(L_EXTENDEDPRICE*(1-L_DISCOUNT)) AS REVENUE, O_ORDERDATE, O_SHIPPRIORITY
FROM CUSTOMER, ORDERS, LINEITEM
WHERE C_MKTSEGMENT = 'BUILDING' AND C_CUSTKEY = O_CUSTKEY AND L_ORDERKEY = O_ORDERKEY AND
 O_ORDERDATE < date '1995-03-15' AND L_SHIPDATE > date '1995-03-15'
GROUP BY L_ORDERKEY, O_ORDERDATE, O_SHIPPRIORITY
ORDER BY REVENUE DESC, O_ORDERDATE;

Query 4:

SELECT O_ORDERPRIORITY, COUNT(*) AS ORDER_COUNT FROM ORDERS
WHERE O_ORDERDATE >= date '1993-07-01' AND O_ORDERDATE < date '1995-01-01'
AND EXISTS (SELECT * FROM LINEITEM WHERE L_ORDERKEY = O_ORDERKEY AND L_COMMITDATE < L_RECEIPTDATE)
GROUP BY O_ORDERPRIORITY
ORDER BY O_ORDERPRIORITY;

Query 5:

select
n_name,
sum(l_extendedprice * (1 - l_discount)) as revenue
from
customer,
orders,
lineitem,
supplier,
nation,
region
where
c_custkey = o_custkey
and l_orderkey = o_orderkey
and l_suppkey = s_suppkey
and c_nationkey = s_nationkey
and s_nationkey = n_nationkey
and n_regionkey = r_regionkey
and r_name = 'ASIA'
and o_orderdate >= date '1994-01-01'
and o_orderdate < date '1995-01-01' 
group by
n_name
order by
revenue desc;

Query 6:

select
sum(l_extendedprice*l_discount) as revenue
from
lineitem
where
l_shipdate >= date '1994-01-01'
and l_shipdate < date '1995-01-01'
and l_discount between 0.06 - 0.01 and 0.06 + 0.01
and l_quantity < 24;

Query 7:

select
supp_nation,
cust_nation,
l_year, sum(volume) as revenue
from (
select
n1.n_name as supp_nation,
n2.n_name as cust_nation,
extract(year from l_shipdate) as l_year,
l_extendedprice * (1 - l_discount) as volume
from
supplier,
lineitem,
orders,
customer,
nation n1,
nation n2
where
s_suppkey = l_suppkey
and o_orderkey = l_orderkey
and c_custkey = o_custkey
and s_nationkey = n1.n_nationkey
and c_nationkey = n2.n_nationkey
and (
(n1.n_name = 'FRANCE' and n2.n_name = 'GERMANY')
or (n1.n_name = 'GERMANY' and n2.n_name = 'FRANCE')
)
and l_shipdate between date '1995-01-01' and date '1996-12-31'
) as shipping
group by
supp_nation,
cust_nation,
l_year
order by
supp_nation,
cust_nation,
l_year;

Query 8:

select
o_year,
sum(case
when nation = 'BRAZIL'
then volume
else 0
end) / sum(volume) as mkt_share
from (
select
extract(year from o_orderdate) as o_year,
l_extendedprice * (1-l_discount) as volume,
n2.n_name as nation
from
part,
supplier,
lineitem,
orders,
customer,
nation n1,
nation n2,
region
where
p_partkey = l_partkey
and s_suppkey = l_suppkey
and l_orderkey = o_orderkey
and o_custkey = c_custkey
and c_nationkey = n1.n_nationkey
and n1.n_regionkey = r_regionkey
and r_name = 'AMERICA'
and s_nationkey = n2.n_nationkey
and o_orderdate between date '1995-01-01' and date '1996-12-31'
and p_type = 'ECONOMY ANODIZED STEEL'
) as all_nations
group by
o_year
order by
o_year;

Query 9:

select
nation,
o_year,
sum(amount) as sum_profit
from (
select
n_name as nation,
extract(year from o_orderdate) as o_year,
l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount
from
part,
supplier,
lineitem,
partsupp,
orders,
nation
where
s_suppkey = l_suppkey
and ps_suppkey = l_suppkey
and ps_partkey = l_partkey
and p_partkey = l_partkey
and o_orderkey = l_orderkey
and s_nationkey = n_nationkey
and p_name like '%green%'
) as profit
group by
nation,
o_year
order by
nation,
o_year desc;

Query 10:

select
c_custkey,
c_name,
sum(l_extendedprice * (1 - l_discount)) as revenue,
c_acctbal,
n_name,
c_address,
c_phone,
c_comment
from
customer,
orders,
lineitem,
nation
where
c_custkey = o_custkey
and l_orderkey = o_orderkey
and o_orderdate >= date '1993-10-01'
and o_orderdate < date '1995-01-01'
and l_returnflag = 'R'
and c_nationkey = n_nationkey
group by
c_custkey,
c_name,
c_acctbal,
c_phone,
n_name,
c_address,
c_comment
order by
revenue desc;

Query 11:

select
ps_partkey,
sum(ps_supplycost * ps_availqty) as value
from
partsupp,
supplier,
nation
where
ps_suppkey = s_suppkey
and s_nationkey = n_nationkey
and n_name = 'GERMANY'
group by
ps_partkey having
sum(ps_supplycost * ps_availqty) > (
select
sum(ps_supplycost * ps_availqty) * 0.00000001
from
partsupp,
supplier,
nation
where
ps_suppkey = s_suppkey
and s_nationkey = n_nationkey
and n_name = 'GERMANY'
)
order by
value desc;

Query 12:

select
l_shipmode,
sum(case
when o_orderpriority ='1-URGENT'
or o_orderpriority ='2-HIGH'
then 1
else 0
end) as high_line_count,
sum(case
when o_orderpriority <> '1-URGENT'
and o_orderpriority <> '2-HIGH'
then 1
else 0
end) as low_line_count
from
orders,
lineitem
where
o_orderkey = l_orderkey
and l_shipmode in ('MAIL', 'SHIP')
and l_commitdate < l_receiptdate
and l_shipdate < l_commitdate
and l_receiptdate >= date '1994-01-01'
and l_receiptdate < date '1995-01-01'
group by
l_shipmode
order by
l_shipmode;

Query 13:

select
c_count, count(*) as custdist
from (
select
c_custkey,
count(o_orderkey)
from
customer left outer join orders on
c_custkey = o_custkey
and o_comment not like '%special%requests%'
group by
c_custkey
)as c_orders (c_custkey, c_count)
group by
c_count
order by
custdist desc,
c_count desc;

Query 14:

select
100.00 * sum(case
when p_type like 'PROMO%'
then l_extendedprice*(1-l_discount)
else 0
end) / sum(l_extendedprice * (1 - l_discount)) as promo_revenue
from
lineitem,
part
where
l_partkey = p_partkey
and l_shipdate >= date '1995-09-01'
and l_shipdate < date '1995-10-01';

Query 15:

Query 16:

select
p_brand,
p_type,
p_size,
count(distinct ps_suppkey) as supplier_cnt
from
partsupp,
part
where
p_partkey = ps_partkey
and p_brand <> 'Brand#45'
and p_type not like 'MEDIUM POLISHED%'
and p_size in (49,14,23,45,19,3,36,9)
and ps_suppkey not in (
select
s_suppkey
from
supplier
where
s_comment like '%Customer%Complaints%'
)
group by
p_brand,
p_type,
p_size
order by
supplier_cnt desc,
p_brand,
p_type,
p_size;

Query 17:

select
sum(l_extendedprice) / 7.0 as avg_yearly
from
lineitem,
part
where
p_partkey = l_partkey
and p_brand = 'Brand#23'
and p_container = 'MED BOX'
and l_quantity < (
select
0.2 * avg(l_quantity)
from
lineitem
where
l_partkey = p_partkey
);

Query 18:

select
c_name,
c_custkey,
o_orderkey,
o_orderdate,
o_totalprice,
sum(l_quantity)
from
customer,
orders,
lineitem
where
o_orderkey in (
select
l_orderkey
from
lineitem
group by
l_orderkey having
sum(l_quantity) > 300
)
and c_custkey = o_custkey
and o_orderkey = l_orderkey
group by
c_name,
c_custkey,
o_orderkey,
o_orderdate,
o_totalprice
order by
o_totalprice desc,
o_orderdate;

Query 19:

select
sum(l_extendedprice * (1 - l_discount) ) as revenue
from
lineitem,
part
where
(
p_partkey = l_partkey
and p_brand = 'Brand#12'
and p_container in ( 'SM CASE', 'SM BOX', 'SM PACK', 'SM PKG')
and l_quantity >= 1 and l_quantity <= 1 + 10
and p_size between 1 and 5
and l_shipmode in ('AIR', 'AIR REG')
and l_shipinstruct = 'DELIVER IN PERSON'
)
or
(
p_partkey = l_partkey
and p_brand = 'Brand#23'
and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK')
and l_quantity >= 10 and l_quantity <= 10 + 10
and p_size between 1 and 10
and l_shipmode in ('AIR', 'AIR REG')
and l_shipinstruct = 'DELIVER IN PERSON'
)
or
(
p_partkey = l_partkey
and p_brand = 'Brand#34'
and p_container in ( 'LG CASE', 'LG BOX', 'LG PACK', 'LG PKG')
and l_quantity >= 20 and l_quantity <= 20 + 10
and p_size between 1 and 15
and l_shipmode in ('AIR', 'AIR REG')
and l_shipinstruct = 'DELIVER IN PERSON'
);

Query 20:

select
s_name,
s_address
from
supplier, nation
where
s_suppkey in (
select
ps_suppkey
from
partsupp
where
ps_partkey in (
select
p_partkey
from
part
where
p_name like 'forest%'
)
and ps_availqty > (
select
0.5 * sum(l_quantity)
from
lineitem
where
l_partkey = ps_partkey
and l_suppkey = ps_suppkey
and l_shipdate >= date '1994-01-01'
and l_shipdate < date '1995-01-01'
)
)
and s_nationkey = n_nationkey
and n_name = 'CANADA'
order by
s_name;

Query 21:

select
s_name,
count(*) as numwait
from
supplier,
lineitem l1,
orders,
nation
where
s_suppkey = l1.l_suppkey
and o_orderkey = l1.l_orderkey
and o_orderstatus = 'F'
and l1.l_receiptdate > l1.l_commitdate
and exists (
select
*
from
lineitem l2
where
l2.l_orderkey = l1.l_orderkey
and l2.l_suppkey <> l1.l_suppkey
)
and not exists (
select
*
from
lineitem l3
where
l3.l_orderkey = l1.l_orderkey
and l3.l_suppkey <> l1.l_suppkey
and l3.l_receiptdate > l3.l_commitdate
)
and s_nationkey = n_nationkey
and n_name = 'SAUDI ARABIA'
group by
s_name
order by
numwait desc,
s_name;

Query 22:

SELECT CNTRYCODE, COUNT(*) AS NUMCUST, SUM(C_ACCTBAL) AS TOTACCTBAL
FROM (SELECT SUBSTRING(C_PHONE,1,2) AS CNTRYCODE, C_ACCTBAL
  FROM CUSTOMER WHERE SUBSTRING(C_PHONE,1,2) IN ('13', '31', '23', '29', '30', '18', '17') AND
  C_ACCTBAL > (SELECT AVG(C_ACCTBAL) FROM CUSTOMER WHERE C_ACCTBAL > 0.00 AND
   SUBSTRING(C_PHONE,1,2) IN ('13', '31', '23', '29', '30', '18', '17')) AND
  NOT EXISTS ( SELECT * FROM ORDERS WHERE O_CUSTKEY = C_CUSTKEY)) AS CUSTSALE
GROUP BY CNTRYCODE
ORDER BY CNTRYCODE;