Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

tpch beeline support #2

Open
wants to merge 7 commits into
base: hdp3
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ddl-tpch/bin_flat/analyze.sql
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use ${DB};
analyze table nation compute statistics for columns;
analyze table region compute statistics for columns;
analyze table supplier compute statistics for columns;
Expand Down
96 changes: 96 additions & 0 deletions ddl-tpch/bin_flat_ext/alltables.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
-- TPC-H staging schema over delimited text files.
-- Creates the database named by the DB variable if needed, then declares one
-- external table per TPC-H relation. Every table reads pipe-delimited text
-- from its own subdirectory under the path given by the LOCATION variable.
CREATE DATABASE IF NOT EXISTS ${DB};
USE ${DB};

-- lineitem
DROP TABLE IF EXISTS lineitem;
CREATE EXTERNAL TABLE lineitem (
    l_orderkey      BIGINT,
    l_partkey       BIGINT,
    l_suppkey       BIGINT,
    l_linenumber    INT,
    l_quantity      DOUBLE,
    l_extendedprice DOUBLE,
    l_discount      DOUBLE,
    l_tax           DOUBLE,
    l_returnflag    STRING,
    l_linestatus    STRING,
    l_shipdate      STRING,
    l_commitdate    STRING,
    l_receiptdate   STRING,
    l_shipinstruct  STRING,
    l_shipmode      STRING,
    l_comment       STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '|'
STORED AS TEXTFILE
LOCATION '${LOCATION}/lineitem';

-- part
DROP TABLE IF EXISTS part;
CREATE EXTERNAL TABLE part (
    p_partkey     BIGINT,
    p_name        STRING,
    p_mfgr        STRING,
    p_brand       STRING,
    p_type        STRING,
    p_size        INT,
    p_container   STRING,
    p_retailprice DOUBLE,
    p_comment     STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '|'
STORED AS TEXTFILE
LOCATION '${LOCATION}/part/';

-- supplier
DROP TABLE IF EXISTS supplier;
CREATE EXTERNAL TABLE supplier (
    s_suppkey   BIGINT,
    s_name      STRING,
    s_address   STRING,
    s_nationkey BIGINT,
    s_phone     STRING,
    s_acctbal   DOUBLE,
    s_comment   STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '|'
STORED AS TEXTFILE
LOCATION '${LOCATION}/supplier/';

-- partsupp
DROP TABLE IF EXISTS partsupp;
CREATE EXTERNAL TABLE partsupp (
    ps_partkey    BIGINT,
    ps_suppkey    BIGINT,
    ps_availqty   INT,
    ps_supplycost DOUBLE,
    ps_comment    STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '|'
STORED AS TEXTFILE
LOCATION '${LOCATION}/partsupp';

-- nation
DROP TABLE IF EXISTS nation;
CREATE EXTERNAL TABLE nation (
    n_nationkey BIGINT,
    n_name      STRING,
    n_regionkey BIGINT,
    n_comment   STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '|'
STORED AS TEXTFILE
LOCATION '${LOCATION}/nation';

-- region
DROP TABLE IF EXISTS region;
CREATE EXTERNAL TABLE region (
    r_regionkey BIGINT,
    r_name      STRING,
    r_comment   STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '|'
STORED AS TEXTFILE
LOCATION '${LOCATION}/region';

-- customer
DROP TABLE IF EXISTS customer;
CREATE EXTERNAL TABLE customer (
    c_custkey    BIGINT,
    c_name       STRING,
    c_address    STRING,
    c_nationkey  BIGINT,
    c_phone      STRING,
    c_acctbal    DOUBLE,
    c_mktsegment STRING,
    c_comment    STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '|'
STORED AS TEXTFILE
LOCATION '${LOCATION}/customer';

-- orders
DROP TABLE IF EXISTS orders;
CREATE EXTERNAL TABLE orders (
    o_orderkey      BIGINT,
    o_custkey       BIGINT,
    o_orderstatus   STRING,
    o_totalprice    DOUBLE,
    o_orderdate     STRING,
    o_orderpriority STRING,
    o_clerk         STRING,
    o_shippriority  INT,
    o_comment       STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '|'
STORED AS TEXTFILE
LOCATION '${LOCATION}/orders';
9 changes: 9 additions & 0 deletions ddl-tpch/bin_flat_ext/analyze.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
-- Gathers column-level statistics for all eight TPC-H tables in the database
-- named by the DB variable. Tables are processed in the order listed below.
USE ${DB};

ANALYZE TABLE nation   COMPUTE STATISTICS FOR COLUMNS;
ANALYZE TABLE region   COMPUTE STATISTICS FOR COLUMNS;
ANALYZE TABLE supplier COMPUTE STATISTICS FOR COLUMNS;
ANALYZE TABLE part     COMPUTE STATISTICS FOR COLUMNS;
ANALYZE TABLE partsupp COMPUTE STATISTICS FOR COLUMNS;
ANALYZE TABLE customer COMPUTE STATISTICS FOR COLUMNS;
ANALYZE TABLE orders   COMPUTE STATISTICS FOR COLUMNS;
ANALYZE TABLE lineitem COMPUTE STATISTICS FOR COLUMNS;
14 changes: 14 additions & 0 deletions ddl-tpch/bin_flat_ext/customer.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
-- Builds the ORC copy of the customer table in the database named by the DB
-- variable. Any previous definition is dropped and recreated first.
CREATE DATABASE IF NOT EXISTS ${DB};
USE ${DB};

DROP TABLE IF EXISTS customer;

CREATE EXTERNAL TABLE customer (
    c_custkey    BIGINT,
    c_name       STRING,
    c_address    STRING,
    c_nationkey  BIGINT,
    c_phone      STRING,
    c_acctbal    DOUBLE,
    c_mktsegment STRING,
    c_comment    STRING
)
STORED AS ORC;

-- Copy every row from the SOURCE database, clustered by market segment.
INSERT OVERWRITE TABLE customer
SELECT *
FROM ${SOURCE}.customer
CLUSTER BY c_mktsegment;
25 changes: 25 additions & 0 deletions ddl-tpch/bin_flat_ext/lineitem.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
-- Builds the ORC copy of the lineitem table in the database named by the DB
-- variable. Any previous definition is dropped and recreated first.
CREATE DATABASE IF NOT EXISTS ${DB};
USE ${DB};

DROP TABLE IF EXISTS lineitem;

CREATE EXTERNAL TABLE lineitem (
    l_orderkey      BIGINT,
    l_partkey       BIGINT,
    l_suppkey       BIGINT,
    l_linenumber    INT,
    l_quantity      DOUBLE,
    l_extendedprice DOUBLE,
    l_discount      DOUBLE,
    l_tax           DOUBLE,
    l_returnflag    STRING,
    l_linestatus    STRING,
    l_shipdate      STRING,
    l_commitdate    STRING,
    l_receiptdate   STRING,
    l_shipinstruct  STRING,
    l_shipmode      STRING,
    l_comment       STRING
)
STORED AS ORC;

-- Copy every row from the SOURCE database, clustered by ship date.
INSERT OVERWRITE TABLE lineitem
SELECT *
FROM ${SOURCE}.lineitem
CLUSTER BY l_shipdate;
12 changes: 12 additions & 0 deletions ddl-tpch/bin_flat_ext/nation.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
-- Builds the ORC copy of the nation table in the database named by the DB
-- variable. Any previous definition is dropped and recreated first.
CREATE DATABASE IF NOT EXISTS ${DB};
USE ${DB};

DROP TABLE IF EXISTS nation;

CREATE EXTERNAL TABLE nation (
    n_nationkey BIGINT,
    n_name      STRING,
    n_regionkey BIGINT,
    n_comment   STRING
)
STORED AS ORC;

-- Copy the rows from the SOURCE database with exact duplicates removed.
INSERT OVERWRITE TABLE nation
SELECT DISTINCT *
FROM ${SOURCE}.nation;
17 changes: 17 additions & 0 deletions ddl-tpch/bin_flat_ext/orders.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
-- Builds the ORC copy of the orders table in the database named by the DB
-- variable. Any previous definition is dropped and recreated first.
CREATE DATABASE IF NOT EXISTS ${DB};
USE ${DB};

DROP TABLE IF EXISTS orders;

CREATE EXTERNAL TABLE orders (
    o_orderkey      BIGINT,
    o_custkey       BIGINT,
    o_orderstatus   STRING,
    o_totalprice    DOUBLE,
    o_orderdate     STRING,
    o_orderpriority STRING,
    o_clerk         STRING,
    o_shippriority  INT,
    o_comment       STRING
)
STORED AS ORC;

-- Copy every row from the SOURCE database, clustered by order date.
INSERT OVERWRITE TABLE orders
SELECT *
FROM ${SOURCE}.orders
CLUSTER BY o_orderdate;
16 changes: 16 additions & 0 deletions ddl-tpch/bin_flat_ext/part.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
-- Builds the ORC copy of the part table in the database named by the DB
-- variable. Any previous definition is dropped and recreated first.
CREATE DATABASE IF NOT EXISTS ${DB};
USE ${DB};

DROP TABLE IF EXISTS part;

CREATE EXTERNAL TABLE part (
    p_partkey     BIGINT,
    p_name        STRING,
    p_mfgr        STRING,
    p_brand       STRING,
    p_type        STRING,
    p_size        INT,
    p_container   STRING,
    p_retailprice DOUBLE,
    p_comment     STRING
)
STORED AS ORC;

-- Copy every row from the SOURCE database, clustered by brand.
INSERT OVERWRITE TABLE part
SELECT *
FROM ${SOURCE}.part
CLUSTER BY p_brand;
11 changes: 11 additions & 0 deletions ddl-tpch/bin_flat_ext/partsupp.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
-- Builds the ORC copy of the partsupp table in the database named by the DB
-- variable. Any previous definition is dropped and recreated first.
CREATE DATABASE IF NOT EXISTS ${DB};
USE ${DB};

DROP TABLE IF EXISTS partsupp;

CREATE EXTERNAL TABLE partsupp (
    ps_partkey    BIGINT,
    ps_suppkey    BIGINT,
    ps_availqty   INT,
    ps_supplycost DOUBLE,
    ps_comment    STRING
)
STORED AS ORC;

-- Copy every row from the SOURCE database, clustered by supplier key.
INSERT OVERWRITE TABLE partsupp
SELECT *
FROM ${SOURCE}.partsupp
CLUSTER BY ps_suppkey;
10 changes: 10 additions & 0 deletions ddl-tpch/bin_flat_ext/region.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
-- Builds the ORC copy of the region table in the database named by the DB
-- variable. Any previous definition is dropped and recreated first.
CREATE DATABASE IF NOT EXISTS ${DB};
USE ${DB};

DROP TABLE IF EXISTS region;

CREATE EXTERNAL TABLE region (
    r_regionkey BIGINT,
    r_name      STRING,
    r_comment   STRING
)
STORED AS ORC;

-- Copy the rows from the SOURCE database with exact duplicates removed.
INSERT OVERWRITE TABLE region
SELECT DISTINCT *
FROM ${SOURCE}.region;
15 changes: 15 additions & 0 deletions ddl-tpch/bin_flat_ext/supplier.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
-- Builds the ORC copy of the supplier table in the database named by the DB
-- variable. Any previous definition is dropped and recreated first.
CREATE DATABASE IF NOT EXISTS ${DB};
USE ${DB};

DROP TABLE IF EXISTS supplier;

CREATE EXTERNAL TABLE supplier (
    s_suppkey   BIGINT,
    s_name      STRING,
    s_address   STRING,
    s_nationkey BIGINT,
    s_phone     STRING,
    s_acctbal   DOUBLE,
    s_comment   STRING
)
STORED AS ORC;

-- Copy every row from the SOURCE database, clustered by nation key and then
-- supplier key.
INSERT OVERWRITE TABLE supplier
SELECT *
FROM ${SOURCE}.supplier
CLUSTER BY s_nationkey, s_suppkey;
1 change: 1 addition & 0 deletions ddl-tpch/bin_partitioned/analyze.sql
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use ${DB};
analyze table nation compute statistics for columns;
analyze table region compute statistics for columns;
analyze table supplier compute statistics for columns;
Expand Down
14 changes: 0 additions & 14 deletions settings/load-flat.sql
Original file line number Diff line number Diff line change
@@ -1,15 +1 @@
--set hive.enforce.bucketing=true;
--set hive.enforce.sorting=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions.pernode=1000000;
set hive.exec.max.dynamic.partitions=1000000;
set hive.exec.max.created.files=1000000;

set mapreduce.input.fileinputformat.split.minsize=240000000;
set mapreduce.input.fileinputformat.split.maxsize=240000000;
set mapreduce.input.fileinputformat.split.minsize.per.node=240000000;
set mapreduce.input.fileinputformat.split.minsize.per.rack=240000000;
--set hive.exec.parallel=true;
set hive.stats.autogather=true;
set hive.support.concurrency=false;
set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DummyTxnManager;
4 changes: 2 additions & 2 deletions tpch-gen/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.4.0</version>
<version>3.1.0</version>
<scope>compile</scope>
</dependency>
<dependency>
Expand Down Expand Up @@ -58,7 +58,7 @@
<archive>
<manifest>
<addClasspath>true</addClasspath>
<classpathPrefix>lib/</classpathPrefix>
<classpathPrefix>lib/</classpathPrefix>
<mainClass>org.notmysock.tpch.GenTable</mainClass>
</manifest>
</archive>
Expand Down
19 changes: 11 additions & 8 deletions tpch-setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -60,15 +60,17 @@ if [ $? -ne 0 ]; then
fi
echo "TPC-H text data generation complete."

HIVE="beeline -n hive -u 'jdbc:hive2://localhost:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2?tez.queue.name=default' "

# Create the text/flat tables as external tables. These will later be converted to ORCFile.
echo "Loading text data into external tables."
runcommand "hive -i settings/load-flat.sql -f ddl-tpch/bin_flat/alltables.sql -d DB=tpch_text_${SCALE} -d LOCATION=${DIR}/${SCALE}"
runcommand "$HIVE -i settings/load-flat.sql -f ddl-tpch/bin_flat/alltables.sql --hivevar DB=tpch_text_${SCALE} --hivevar LOCATION=${DIR}/${SCALE}"

# Create the optimized tables.
i=1
total=8

if test $SCALE -le 1000; then
if test $SCALE -le 1000; then
SCHEMA_TYPE=flat
else
SCHEMA_TYPE=partitioned
Expand All @@ -78,14 +80,15 @@ DATABASE=tpch_${SCHEMA_TYPE}_orc_${SCALE}
MAX_REDUCERS=2600 # ~7 years of data
REDUCERS=$((test ${SCALE} -gt ${MAX_REDUCERS} && echo ${MAX_REDUCERS}) || echo ${SCALE})


for t in ${TABLES}
do
echo "Optimizing table $t ($i/$total)."
COMMAND="hive -i settings/load-${SCHEMA_TYPE}.sql -f ddl-tpch/bin_${SCHEMA_TYPE}/${t}.sql \
-d DB=${DATABASE} \
-d SOURCE=tpch_text_${SCALE} -d BUCKETS=${BUCKETS} \
-d SCALE=${SCALE} -d REDUCERS=${REDUCERS} \
-d FILE=orc"
COMMAND="$HIVE -i settings/load-${SCHEMA_TYPE}.sql -f ddl-tpch/bin_${SCHEMA_TYPE}/${t}.sql \
--hivevar DB=${DATABASE} \
--hivevar SOURCE=tpch_text_${SCALE} --hivevar BUCKETS=${BUCKETS} \
--hivevar SCALE=${SCALE} --hivevar REDUCERS=${REDUCERS} \
--hivevar FILE=orc"
runcommand "$COMMAND"
if [ $? -ne 0 ]; then
echo "Command failed, try 'export DEBUG_SCRIPT=ON' and re-running"
Expand All @@ -94,6 +97,6 @@ do
i=`expr $i + 1`
done

hive -i settings/load-${SCHEMA_TYPE}.sql -f ddl-tpch/bin_${SCHEMA_TYPE}/analyze.sql --database ${DATABASE};
$HIVE -i settings/load-${SCHEMA_TYPE}.sql -f ddl-tpch/bin_${SCHEMA_TYPE}/analyze.sql --hivevar DB=${DATABASE};

echo "Data loaded into database ${DATABASE}."