From c49725094a1a0395aeaeb4cb771ddfbd1566bdc2 Mon Sep 17 00:00:00 2001 From: "Susan X. Huynh" Date: Tue, 14 Aug 2018 15:03:16 -0700 Subject: [PATCH 01/15] added files for Hive Docker image --- tools/hive/README.md | 63 ++ tools/hive/download_deps.sh | 21 + tools/hive/hadoop-2.6.0/Dockerfile | 53 ++ tools/hive/hadoop-2.6.0/LICENSE | 202 ++++++ tools/hive/hadoop-2.6.0/conf/mapred-site.xml | 6 + tools/hive/hadoop-2.6.0/scripts/bootstrap.sh | 24 + .../templates/core-site.xml.template | 6 + .../templates/hdfs-site.xml.template | 6 + .../templates/yarn-site.xml.template | 64 ++ tools/hive/hive_pg/Dockerfile | 78 +++ tools/hive/hive_pg/conf/hive-log4j.properties | 88 +++ tools/hive/hive_pg/conf/log4j.properties | 6 + tools/hive/hive_pg/conf/postgresql.conf | 630 ++++++++++++++++++ tools/hive/hive_pg/scripts/bootstrap.sh | 49 ++ .../hive_pg/templates/hive-site.xml.template | 154 +++++ tools/hive/kerberos/Dockerfile | 20 + tools/hive/kerberos/conf/krb5.conf | 10 + .../kerberos/marathon/hdfs-hive-kerberos.json | 39 ++ .../kerberos/templates/core-site.xml.template | 28 + .../kerberos/templates/hdfs-site.xml.template | 58 ++ .../kerberos/templates/hive-site.xml.template | 197 ++++++ .../kerberos/templates/yarn-site.xml.template | 85 +++ tools/hive/ubuntu/Dockerfile | 60 ++ tools/hive/ubuntu/base.env | 19 + tools/hive/ubuntu/conf/ssh_config | 5 + tools/hive/ubuntu/scripts/bootstrap.sh | 12 + 26 files changed, 1983 insertions(+) create mode 100644 tools/hive/README.md create mode 100755 tools/hive/download_deps.sh create mode 100644 tools/hive/hadoop-2.6.0/Dockerfile create mode 100644 tools/hive/hadoop-2.6.0/LICENSE create mode 100644 tools/hive/hadoop-2.6.0/conf/mapred-site.xml create mode 100755 tools/hive/hadoop-2.6.0/scripts/bootstrap.sh create mode 100644 tools/hive/hadoop-2.6.0/templates/core-site.xml.template create mode 100644 tools/hive/hadoop-2.6.0/templates/hdfs-site.xml.template create mode 100644 tools/hive/hadoop-2.6.0/templates/yarn-site.xml.template create mode 
100644 tools/hive/hive_pg/Dockerfile create mode 100644 tools/hive/hive_pg/conf/hive-log4j.properties create mode 100755 tools/hive/hive_pg/conf/log4j.properties create mode 100644 tools/hive/hive_pg/conf/postgresql.conf create mode 100644 tools/hive/hive_pg/scripts/bootstrap.sh create mode 100755 tools/hive/hive_pg/templates/hive-site.xml.template create mode 100644 tools/hive/kerberos/Dockerfile create mode 100644 tools/hive/kerberos/conf/krb5.conf create mode 100644 tools/hive/kerberos/marathon/hdfs-hive-kerberos.json create mode 100644 tools/hive/kerberos/templates/core-site.xml.template create mode 100644 tools/hive/kerberos/templates/hdfs-site.xml.template create mode 100755 tools/hive/kerberos/templates/hive-site.xml.template create mode 100644 tools/hive/kerberos/templates/yarn-site.xml.template create mode 100644 tools/hive/ubuntu/Dockerfile create mode 100644 tools/hive/ubuntu/base.env create mode 100644 tools/hive/ubuntu/conf/ssh_config create mode 100644 tools/hive/ubuntu/scripts/bootstrap.sh diff --git a/tools/hive/README.md b/tools/hive/README.md new file mode 100644 index 00000000..88d37861 --- /dev/null +++ b/tools/hive/README.md @@ -0,0 +1,63 @@ +# Cloudera Hadoop and Hive Docker Image with Kerberos + + +This is a Hadoop Docker image running CDH5 versions of Hadoop and Hive, all in one container. There is a separate Kerberos image in which Hadoop and Hive use Kerberos for authentication. Adapted from https://github.com/tilakpatidar/cdh5_hive_postgres and based on Ubuntu (trusty). + +## Current Version +* Hadoop 2.6.0 + +## Dependencies +The Kerberos image assumes that a KDC has been launched by the dcos-commons kdc.py script. + +## Build the image +Download dependencies: +``` +./download_deps.sh +``` + +Build the Ubuntu base image: +``` +cd ubuntu +docker build -t cdh5-ubuntu . +``` + +Build the Hadoop image: +``` +cd ../hadoop-2.6.0 +docker build -t cdh5-hadoop . 
+``` + +Build the Hadoop + Hive image: +``` +cd ../hive_pg +docker build -t cdh5-hive . +``` + +Build the Kerberized Hadoop + Hive image: +``` +cd ../kerberos +docker build -t cdh5-hive-kerberos . +``` + +## Run the Kerberos image in DC/OS +First, deploy a KDC via the dcos-commons kdc.py utility. See [the kdc README](https://github.com/mesosphere/dcos-commons/tree/master/tools/kdc) for details. + +From the dcos-commons repo: +``` +PYTHONPATH=testing ./tools/kdc/kdc.py deploy principals.txt +``` + +At a minimum, `principals.txt` should include the following principals (for the Hadoop container hostname, pick any private agent in the cluster): + +``` +hdfs/<hostname>@LOCAL +HTTP/<hostname>@LOCAL +yarn/<hostname>@LOCAL +hive/<hostname>@LOCAL +``` + +Deploy the Kerberized Hadoop / Hive container via Marathon. (Update the Marathon config's `constraint` field first with the host selected above.) + +``` +dcos marathon app add kerberos/marathon/hdfs-hive-kerberos.json +``` diff --git a/tools/hive/download_deps.sh b/tools/hive/download_deps.sh new file mode 100755 index 00000000..7a8e2c0d --- /dev/null +++ b/tools/hive/download_deps.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +set -x +set -e +source ubuntu/base.env + +mkdir hadoop-2.6.0/deps +mkdir hive_pg/deps + +#download cdh +echo "wget http://archive.cloudera.com/cdh${CDH_VERSION}/cdh/${CDH_VERSION}/hadoop-${HADOOP_VERSION}-cdh${CDH_EXACT_VERSION}.tar.gz | tar -xz -C /usr/local/" +wget http://archive.cloudera.com/cdh${CDH_VERSION}/cdh/${CDH_VERSION}/hadoop-${HADOOP_VERSION}-cdh${CDH_EXACT_VERSION}.tar.gz +mv hadoop-${HADOOP_VERSION}-cdh${CDH_EXACT_VERSION}.tar.gz hadoop-2.6.0/deps/ + +#download native hadoop libs +echo "wget http://dl.bintray.com/sequenceiq/sequenceiq-bin/hadoop-native-64-2.6.0.tar" +wget http://dl.bintray.com/sequenceiq/sequenceiq-bin/hadoop-native-64-2.6.0.tar +mv hadoop-native-64-2.6.0.tar hadoop-2.6.0/deps/ + +echo "wget http://archive.cloudera.com/cdh${CDH_VERSION}/cdh/${CDH_VERSION}/hive-1.1.0-cdh${CDH_EXACT_VERSION}.tar.gz" +wget
http://archive.cloudera.com/cdh${CDH_VERSION}/cdh/${CDH_VERSION}/hive-1.1.0-cdh${CDH_EXACT_VERSION}.tar.gz +mv hive-1.1.0-cdh${CDH_EXACT_VERSION}.tar.gz hive_pg/deps/ diff --git a/tools/hive/hadoop-2.6.0/Dockerfile b/tools/hive/hadoop-2.6.0/Dockerfile new file mode 100644 index 00000000..43f8e613 --- /dev/null +++ b/tools/hive/hadoop-2.6.0/Dockerfile @@ -0,0 +1,53 @@ +# Creates pseudo distributed hadoop 2.6.0 in ubuntu +FROM cdh5-ubuntu + +USER root + +ENV JAVA_HOME /usr/lib/jvm/java-8-oracle +ENV HADOOP_VERSION 2.6.0 +ENV CDH_VERSION 5 +ENV CDH_EXACT_VERSION 5.11.0 +ENV HADOOP_HOME /usr/local/hadoop +ENV HADOOP_PREFIX /usr/local/hadoop +ENV HADOOP_COMMON_HOME /usr/local/hadoop +ENV HADOOP_HDFS_HOME /usr/local/hadoop +ENV HADOOP_MAPRED_HOME /usr/local/hadoop +ENV HADOOP_YARN_HOME /usr/local/hadoop +ENV HADOOP_CONF_DIR /usr/local/hadoop/etc/hadoop + +ENV PATH $PATH:$JAVA_HOME/bin:$HIVE_HOME/bin:$HADOOP_HOME:$HADOOP_HOME/bin + +ADD ./deps/hadoop-${HADOOP_VERSION}-cdh${CDH_EXACT_VERSION}.tar.gz /usr/local +RUN cd /usr/local && ln -s ./hadoop-${HADOOP_VERSION}-cdh${CDH_EXACT_VERSION} hadoop + +RUN sed -i '/^export JAVA_HOME/ s:.*:export JAVA_HOME=/usr/lib/jvm/java-8-oracle\nexport HADOOP_PREFIX=/usr/local/hadoop\nexport HADOOP_HOME=/usr/local/hadoop\n:' $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh +RUN sed -i '/^export HADOOP_CONF_DIR/ s:.*:export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop/:' $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh + +# copy hadoop site xml files +RUN mkdir $HADOOP_PREFIX/input +RUN cp $HADOOP_PREFIX/etc/hadoop/*.xml $HADOOP_PREFIX/input + +# pseudo distributed configurations +ADD templates/core-site.xml.template $HADOOP_PREFIX/etc/hadoop/core-site.xml.template +ADD templates/hdfs-site.xml.template $HADOOP_PREFIX/etc/hadoop/hdfs-site.xml.template +ADD conf/mapred-site.xml $HADOOP_PREFIX/etc/hadoop/mapred-site.xml +ADD templates/yarn-site.xml.template $HADOOP_PREFIX/etc/hadoop/yarn-site.xml.template + +# format namenode +RUN $HADOOP_PREFIX/bin/hdfs 
namenode -format + +# fixing the libhadoop.so +RUN rm -rf /usr/local/hadoop/lib/native/* +ADD ./deps/hadoop-native-64-2.6.0.tar /usr/local/hadoop/lib/native/ + +# add and set permissions for bootstrap script +ADD scripts/bootstrap.sh /etc/hadoop-bootstrap.sh +RUN chown root:root /etc/hadoop-bootstrap.sh +RUN chmod 700 /etc/hadoop-bootstrap.sh + +RUN chmod +x /usr/local/hadoop/etc/hadoop/*-env.sh + +#for exposed ports refer +#https://www.cloudera.com/documentation/enterprise/5-4-x/topics/cdh_ig_ports_cdh5.html + +EXPOSE 50010 50020 50070 50075 50090 8020 9000 10020 19888 8030 8031 8032 8033 8040 8042 8088 \ No newline at end of file diff --git a/tools/hive/hadoop-2.6.0/LICENSE b/tools/hive/hadoop-2.6.0/LICENSE new file mode 100644 index 00000000..e06d2081 --- /dev/null +++ b/tools/hive/hadoop-2.6.0/LICENSE @@ -0,0 +1,202 @@ +Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright {yyyy} {name of copyright owner} + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + diff --git a/tools/hive/hadoop-2.6.0/conf/mapred-site.xml b/tools/hive/hadoop-2.6.0/conf/mapred-site.xml new file mode 100644 index 00000000..dba582f1 --- /dev/null +++ b/tools/hive/hadoop-2.6.0/conf/mapred-site.xml @@ -0,0 +1,6 @@ +<configuration> + <property> + <name>mapreduce.framework.name</name> + <value>yarn</value> + </property> +</configuration> diff --git a/tools/hive/hadoop-2.6.0/scripts/bootstrap.sh b/tools/hive/hadoop-2.6.0/scripts/bootstrap.sh new file mode 100755 index 00000000..7df86ffe --- /dev/null +++ b/tools/hive/hadoop-2.6.0/scripts/bootstrap.sh @@ -0,0 +1,24 @@ +#!/bin/bash +set -x +/usr/sbin/sshd +: ${HADOOP_PREFIX:=/usr/local/hadoop} + +$HADOOP_PREFIX/etc/hadoop/hadoop-env.sh + +rm /tmp/*.pid + +# installing libraries if any - (resource urls added comma separated to the ACP system variable) +cd $HADOOP_PREFIX/share/hadoop/common ; for cp in ${ACP//,/ }; do echo == $cp; curl -LO $cp ; done; cd - + +# templating of config files +sed s/HOSTNAME/$HOSTNAME/ /usr/local/hadoop/etc/hadoop/core-site.xml.template > /usr/local/hadoop/etc/hadoop/core-site.xml +sed s/HOSTNAME/$HOSTNAME/ /usr/local/hadoop/etc/hadoop/yarn-site.xml.template > /usr/local/hadoop/etc/hadoop/yarn-site.xml +sed s/HOSTNAME/$HOSTNAME/ /usr/local/hadoop/etc/hadoop/hdfs-site.xml.template > /usr/local/hadoop/etc/hadoop/hdfs-site.xml + + +$HADOOP_PREFIX/sbin/start-dfs.sh +$HADOOP_PREFIX/sbin/start-yarn.sh + +if [[ $1 == "-bash" ]]; then + /bin/bash +fi diff --git
a/tools/hive/hadoop-2.6.0/templates/core-site.xml.template b/tools/hive/hadoop-2.6.0/templates/core-site.xml.template new file mode 100644 index 00000000..3576bbd5 --- /dev/null +++ b/tools/hive/hadoop-2.6.0/templates/core-site.xml.template @@ -0,0 +1,6 @@ +<configuration> + <property> + <name>fs.defaultFS</name> + <value>hdfs://HOSTNAME:9000</value> + </property> +</configuration> diff --git a/tools/hive/hadoop-2.6.0/templates/hdfs-site.xml.template b/tools/hive/hadoop-2.6.0/templates/hdfs-site.xml.template new file mode 100644 index 00000000..82c525ea --- /dev/null +++ b/tools/hive/hadoop-2.6.0/templates/hdfs-site.xml.template @@ -0,0 +1,6 @@ +<configuration> + <property> + <name>dfs.replication</name> + <value>1</value> + </property> +</configuration> diff --git a/tools/hive/hadoop-2.6.0/templates/yarn-site.xml.template b/tools/hive/hadoop-2.6.0/templates/yarn-site.xml.template new file mode 100644 index 00000000..37c1850a --- /dev/null +++ b/tools/hive/hadoop-2.6.0/templates/yarn-site.xml.template @@ -0,0 +1,64 @@ +<configuration> + <property> + <name>yarn.nodemanager.aux-services</name> + <value>mapreduce_shuffle</value> + </property> + + <property> + <name>yarn.application.classpath</name> + <value>/usr/local/hadoop/etc/hadoop, /usr/local/hadoop/share/hadoop/common/*, /usr/local/hadoop/share/hadoop/common/lib/*, /usr/local/hadoop/share/hadoop/hdfs/*, /usr/local/hadoop/share/hadoop/hdfs/lib/*, /usr/local/hadoop/share/hadoop/mapreduce/*, /usr/local/hadoop/share/hadoop/mapreduce/lib/*, /usr/local/hadoop/share/hadoop/yarn/*, /usr/local/hadoop/share/hadoop/yarn/lib/*</value> + </property> + + <property> + <description> + Number of seconds after an application finishes before the nodemanager's + DeletionService will delete the application's localized file directory + and log directory. + + To diagnose Yarn application problems, set this property's value large + enough (for example, to 600 = 10 minutes) to permit examination of these + directories. After changing the property's value, you must restart the + nodemanager in order for it to have an effect.
+ + The roots of Yarn applications' work directories is configurable with + the yarn.nodemanager.local-dirs property (see below), and the roots + of the Yarn applications' log directories is configurable with the + yarn.nodemanager.log-dirs property (see also below). + </description> + <name>yarn.nodemanager.delete.debug-delay-sec</name> + <value>600</value> + </property> + + <property> + <description>Indicate to clients whether Timeline service is enabled or not. + If enabled, the TimelineClient library used by end-users will post entities + and events to the Timeline server.</description> + <name>yarn.timeline-service.enabled</name> + <value>true</value> + </property> + + <property> + <description>The hostname of the Timeline service web application.</description> + <name>yarn.timeline-service.hostname</name> + <value>0.0.0.0</value> + </property> + + <property> + <description>Enables cross-origin support (CORS) for web services where + cross-origin web response headers are needed. For example, javascript making + a web services request to the timeline server.</description> + <name>yarn.timeline-service.http-cross-origin.enabled</name> + <value>true</value> + </property> + + <property> + <name>yarn.resourcemanager.webapp.cross-origin.enabled</name> + <value>true</value> + </property> + + <property> + <description>Publish YARN information to Timeline Server</description> + <name>yarn.resourcemanager.system-metrics-publisher.enabled</name> + <value>true</value> + </property> +</configuration> diff --git a/tools/hive/hive_pg/Dockerfile b/tools/hive/hive_pg/Dockerfile new file mode 100644 index 00000000..dc506780 --- /dev/null +++ b/tools/hive/hive_pg/Dockerfile @@ -0,0 +1,78 @@ +FROM cdh5-hadoop + +ENV JAVA_HOME /usr/lib/jvm/java-8-oracle +ENV HIVE_HOME /usr/local/hive +ENV HIVE_CONF /usr/local/hive/conf +ENV HIVE_VERSION 1.1.0 +ENV HADOOP_VERSION 2.6.0 +ENV CDH_VERSION 5 +ENV CDH_EXACT_VERSION 5.11.0 +ENV POSTGRES_VERSION 9.5 +ENV POSTGRESQL_MAIN /var/lib/postgresql/9.5/main/ +ENV POSTGRESQL_CONFIG_FILE /var/lib/postgresql/9.5/main/postgresql.conf +ENV POSTGRESQL_BIN /usr/lib/postgresql/9.5/bin/postgres +ENV PGPASSWORD hive +ENV HADOOP_HOME /usr/local/hadoop +ENV HADOOP_PREFIX /usr/local/hadoop +ENV HADOOP_COMMON_HOME /usr/local/hadoop +ENV HADOOP_HDFS_HOME /usr/local/hadoop +ENV HADOOP_MAPRED_HOME /usr/local/hadoop +ENV HADOOP_YARN_HOME /usr/local/hadoop +ENV HADOOP_CONF_DIR
/usr/local/hadoop/etc/hadoop + +ENV PATH $PATH:$JAVA_HOME/bin:$HIVE_HOME/bin:$HADOOP_HOME:$HADOOP_HOME/bin + +# add hive +ADD ./deps/hive-1.1.0-cdh${CDH_EXACT_VERSION}.tar.gz /usr/local/ +RUN mv /usr/local/hive-${HIVE_VERSION}-cdh${CDH_EXACT_VERSION} /usr/local/hive + + +# add postgresql jdbc jar to classpath +RUN ln -s /usr/share/java/postgresql-jdbc4.jar $HIVE_HOME/lib/postgresql-jdbc4.jar + +# to configure postgres as hive metastore backend +RUN sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt/ `lsb_release -cs`-pgdg main" >> /etc/apt/sources.list.d/pgdg.list' +RUN wget -q https://www.postgresql.org/media/keys/ACCC4CF8.asc -O - | sudo apt-key add - +RUN apt-get update -y +RUN apt-get -yq install vim postgresql-9.5 libpostgresql-jdbc-java + +USER postgres +# initialize hive metastore db +# create metastore db, hive user and assign privileges +RUN cd $HIVE_HOME/scripts/metastore/upgrade/postgres/ &&\ + /etc/init.d/postgresql start &&\ + psql --command "CREATE DATABASE metastore;" &&\ + psql --command "CREATE USER hive WITH PASSWORD 'hive';" && \ + psql --command "ALTER USER hive WITH SUPERUSER;" && \ + psql --command "GRANT ALL PRIVILEGES ON DATABASE metastore TO hive;" && \ + psql -U hive -d metastore -h localhost -f hive-schema-${HIVE_VERSION}.postgres.sql + + +# revert back to default user +USER root + +# disable ssl in postgres.conf +ADD conf/postgresql.conf $POSTGRESQL_MAIN +RUN echo $POSTGRESQL_MAIN +RUN echo $POSTGRESQL_CONFIG_FILE +RUN chown postgres:postgres $POSTGRESQL_CONFIG_FILE +RUN sed -i -e 's/peer/md5/g' /etc/postgresql/$POSTGRES_VERSION/main/pg_hba.conf + + +# copy config, sql, data files to /opt/files +RUN mkdir /opt/files +RUN echo $HIVE_CONF +ADD templates/hive-site.xml.template /opt/files/ +ADD conf/hive-log4j.properties /opt/files/ +ADD templates/hive-site.xml.template $HIVE_CONF/hive-site.xml.template +ADD conf/hive-log4j.properties $HIVE_CONF/hive-log4j.properties + +# set permissions for hive bootstrap file +ADD 
scripts/bootstrap.sh /etc/hive-bootstrap.sh +RUN chown root:root /etc/hive-bootstrap.sh +RUN chmod 700 /etc/hive-bootstrap.sh + +EXPOSE 10000 10001 10002 10003 9083 50111 5432 + +# run bootstrap script +ENTRYPOINT ["/etc/hive-bootstrap.sh", "-d"] diff --git a/tools/hive/hive_pg/conf/hive-log4j.properties b/tools/hive/hive_pg/conf/hive-log4j.properties new file mode 100644 index 00000000..b258a503 --- /dev/null +++ b/tools/hive/hive_pg/conf/hive-log4j.properties @@ -0,0 +1,88 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Define some default values that can be overridden by system properties +hive.log.threshold=ALL +hive.root.logger=INFO,DRFA +hive.log.dir=/tmp/logs/ +hive.log.file=hive.log + +# Define the root logger to the system property "hive.root.logger". +log4j.rootLogger=${hive.root.logger}, EventCounter + +# Logging Threshold +log4j.threshold=${hive.log.threshold} + +# +# Daily Rolling File Appender +# +# Use the PidDailyRollingFileAppender class instead if you want to use separate log files +# for different CLI sessions.
+# +# log4j.appender.DRFA=org.apache.hadoop.hive.ql.log.PidDailyRollingFileAppender + +log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender + +log4j.appender.DRFA.File=${hive.log.dir}/${hive.log.file} + +# Rollover at midnight +log4j.appender.DRFA.DatePattern=.yyyy-MM-dd + +# 30-day backup +#log4j.appender.DRFA.MaxBackupIndex=30 +log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout + +# Pattern format: Date LogLevel LoggerName LogMessage +#log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n +# Debugging Pattern format +log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n + + +# +# console +# Add "console" to rootlogger above if you want to use this +# + +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.err +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n +log4j.appender.console.encoding=UTF-8 + +#custom logging levels +#log4j.logger.xxx=DEBUG + +# +# Event Counter Appender +# Sends counts of logging messages at different severity levels to Hadoop Metrics.
+# +log4j.appender.EventCounter=org.apache.hadoop.hive.shims.HiveEventCounter + + +log4j.category.DataNucleus=ERROR,DRFA +log4j.category.Datastore=ERROR,DRFA +log4j.category.Datastore.Schema=ERROR,DRFA +log4j.category.JPOX.Datastore=ERROR,DRFA +log4j.category.JPOX.Plugin=ERROR,DRFA +log4j.category.JPOX.MetaData=ERROR,DRFA +log4j.category.JPOX.Query=ERROR,DRFA +log4j.category.JPOX.General=ERROR,DRFA +log4j.category.JPOX.Enhancer=ERROR,DRFA + + +# Silence useless ZK logs +log4j.logger.org.apache.zookeeper.server.NIOServerCnxn=WARN,DRFA +log4j.logger.org.apache.zookeeper.ClientCnxnSocketNIO=WARN,DRFA diff --git a/tools/hive/hive_pg/conf/log4j.properties b/tools/hive/hive_pg/conf/log4j.properties new file mode 100755 index 00000000..bd066b0b --- /dev/null +++ b/tools/hive/hive_pg/conf/log4j.properties @@ -0,0 +1,6 @@ +log4j.rootLogger=${hive.root.logger} +hive.root.logger=INFO,console +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.err +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n diff --git a/tools/hive/hive_pg/conf/postgresql.conf b/tools/hive/hive_pg/conf/postgresql.conf new file mode 100644 index 00000000..38d855ed --- /dev/null +++ b/tools/hive/hive_pg/conf/postgresql.conf @@ -0,0 +1,630 @@ +# ----------------------------- +# PostgreSQL configuration file +# ----------------------------- +# +# This file consists of lines of the form: +# +# name = value +# +# (The "=" is optional.) Whitespace may be used. Comments are introduced with +# "#" anywhere on a line. The complete list of parameter names and allowed +# values can be found in the PostgreSQL documentation. +# +# The commented-out settings shown in this file represent the default values. +# Re-commenting a setting is NOT sufficient to revert it to the default value; +# you need to reload the server. 
+# +# This file is read on server startup and when the server receives a SIGHUP +# signal. If you edit the file on a running system, you have to SIGHUP the +# server for the changes to take effect, or use "pg_ctl reload". Some +# parameters, which are marked below, require a server shutdown and restart to +# take effect. +# +# Any parameter can also be given as a command-line option to the server, e.g., +# "postgres -c log_connections=on". Some parameters can be changed at run time +# with the "SET" SQL command. +# +# Memory units: kB = kilobytes Time units: ms = milliseconds +# MB = megabytes s = seconds +# GB = gigabytes min = minutes +# TB = terabytes h = hours +# d = days + + +#------------------------------------------------------------------------------ +# FILE LOCATIONS +#------------------------------------------------------------------------------ + +# The default values of these variables are driven from the -D command-line +# option or PGDATA environment variable, represented here as ConfigDir. + +data_directory = '/var/lib/postgresql/9.5/main' # use data in another directory +# (change requires restart) +hba_file = '/var/lib/postgresql/9.5/pg_hba.conf' # host-based authentication file +# (change requires restart) +ident_file = '/var/lib/postgresql/9.5/pg_ident.conf' # ident configuration file +# (change requires restart) + +# If external_pid_file is not explicitly set, no extra PID file is written. 
+#external_pid_file = '' # write an extra PID file +# (change requires restart) + + +#------------------------------------------------------------------------------ +# CONNECTIONS AND AUTHENTICATION +#------------------------------------------------------------------------------ + +# - Connection Settings - + +listen_addresses = '*' # what IP address(es) to listen on; +# comma-separated list of addresses; +# defaults to 'localhost'; use '*' for all +# (change requires restart) +port = 5432 # (change requires restart) +max_connections = 100 # (change requires restart) +# Note: Increasing max_connections costs ~400 bytes of shared memory per +# connection slot, plus lock space (see max_locks_per_transaction). +#superuser_reserved_connections = 3 # (change requires restart) +#unix_socket_directories = '/tmp' # comma-separated list of directories +# (change requires restart) +#unix_socket_group = '' # (change requires restart) +#unix_socket_permissions = 0777 # begin with 0 to use octal notation +# (change requires restart) +#bonjour = off # advertise server via Bonjour +# (change requires restart) +#bonjour_name = '' # defaults to the computer name +# (change requires restart) + +# - Security and Authentication - + +#authentication_timeout = 1min # 1s-600s +ssl = off # (change requires restart) +#ssl_ciphers = 'HIGH:MEDIUM:+3DES:!aNULL' # allowed SSL ciphers +# (change requires restart) +#ssl_prefer_server_ciphers = on # (change requires restart) +#ssl_ecdh_curve = 'prime256v1' # (change requires restart) +#ssl_cert_file = 'server.crt' # (change requires restart) +#ssl_key_file = 'server.key' # (change requires restart) +#ssl_ca_file = '' # (change requires restart) +#ssl_crl_file = '' # (change requires restart) +#password_encryption = on +#db_user_namespace = off +#row_security = on + +# GSSAPI using Kerberos +#krb_server_keyfile = '' +#krb_caseins_users = off + +# - TCP Keepalives - +# see "man 7 tcp" for details + +#tcp_keepalives_idle = 0 # TCP_KEEPIDLE, in 
seconds; +# 0 selects the system default +#tcp_keepalives_interval = 0 # TCP_KEEPINTVL, in seconds; +# 0 selects the system default +#tcp_keepalives_count = 0 # TCP_KEEPCNT; +# 0 selects the system default + + +#------------------------------------------------------------------------------ +# RESOURCE USAGE (except WAL) +#------------------------------------------------------------------------------ + +# - Memory - + +shared_buffers = 128MB # min 128kB +# (change requires restart) +#huge_pages = try # on, off, or try +# (change requires restart) +#temp_buffers = 8MB # min 800kB +#max_prepared_transactions = 0 # zero disables the feature +# (change requires restart) +# Note: Increasing max_prepared_transactions costs ~600 bytes of shared memory +# per transaction slot, plus lock space (see max_locks_per_transaction). +# It is not advisable to set max_prepared_transactions nonzero unless you +# actively intend to use prepared transactions. +#work_mem = 4MB # min 64kB +#maintenance_work_mem = 64MB # min 1MB +#autovacuum_work_mem = -1 # min 1MB, or -1 to use maintenance_work_mem +#max_stack_depth = 2MB # min 100kB +#dynamic_shared_memory_type = posix # the default is the first option +# supported by the operating system: +# posix +# sysv +# windows +# mmap +# use none to disable dynamic shared memory + +# - Disk - + +#temp_file_limit = -1 # limits per-session temp file space +# in kB, or -1 for no limit + +# - Kernel Resource Usage - + +#max_files_per_process = 1000 # min 25 +# (change requires restart) +#shared_preload_libraries = '' # (change requires restart) + +# - Cost-Based Vacuum Delay - + +#vacuum_cost_delay = 0 # 0-100 milliseconds +#vacuum_cost_page_hit = 1 # 0-10000 credits +#vacuum_cost_page_miss = 10 # 0-10000 credits +#vacuum_cost_page_dirty = 20 # 0-10000 credits +#vacuum_cost_limit = 200 # 1-10000 credits + +# - Background Writer - + +#bgwriter_delay = 200ms # 10-10000ms between rounds +#bgwriter_lru_maxpages = 100 # 0-1000 max buffers written/round 
+#bgwriter_lru_multiplier = 2.0 # 0-10.0 multiplier on buffers scanned/round + +# - Asynchronous Behavior - + +#effective_io_concurrency = 1 # 1-1000; 0 disables prefetching +#max_worker_processes = 8 + + +#------------------------------------------------------------------------------ +# WRITE AHEAD LOG +#------------------------------------------------------------------------------ + +# - Settings - + +#wal_level = minimal # minimal, archive, hot_standby, or logical +# (change requires restart) +#fsync = on # turns forced synchronization on or off +#synchronous_commit = on # synchronization level; +# off, local, remote_write, or on +#wal_sync_method = fsync # the default is the first option +# supported by the operating system: +# open_datasync +# fdatasync (default on Linux) +# fsync +# fsync_writethrough +# open_sync +#full_page_writes = on # recover from partial page writes +#wal_compression = off # enable compression of full-page writes +#wal_log_hints = off # also do full page writes of non-critical updates +# (change requires restart) +#wal_buffers = -1 # min 32kB, -1 sets based on shared_buffers +# (change requires restart) +#wal_writer_delay = 200ms # 1-10000 milliseconds + +#commit_delay = 0 # range 0-100000, in microseconds +#commit_siblings = 5 # range 1-1000 + +# - Checkpoints - + +#checkpoint_timeout = 5min # range 30s-1h +#max_wal_size = 1GB +#min_wal_size = 80MB +#checkpoint_completion_target = 0.5 # checkpoint target duration, 0.0 - 1.0 +#checkpoint_warning = 30s # 0 disables + +# - Archiving - + +#archive_mode = off # enables archiving; off, on, or always +# (change requires restart) +#archive_command = '' # command to use to archive a logfile segment +# placeholders: %p = path of file to archive +# %f = file name only +# e.g. 'test !
-f /mnt/server/archivedir/%f && cp %p /mnt/server/archivedir/%f' +#archive_timeout = 0 # force a logfile segment switch after this +# number of seconds; 0 disables + + +#------------------------------------------------------------------------------ +# REPLICATION +#------------------------------------------------------------------------------ + +# - Sending Server(s) - + +# Set these on the master and on any standby that will send replication data. + +#max_wal_senders = 0 # max number of walsender processes +# (change requires restart) +#wal_keep_segments = 0 # in logfile segments, 16MB each; 0 disables +#wal_sender_timeout = 60s # in milliseconds; 0 disables + +#max_replication_slots = 0 # max number of replication slots +# (change requires restart) +#track_commit_timestamp = off # collect timestamp of transaction commit +# (change requires restart) + +# - Master Server - + +# These settings are ignored on a standby server. + +#synchronous_standby_names = '' # standby servers that provide sync rep +# comma-separated list of application_name +# from standby(s); '*' = all +#vacuum_defer_cleanup_age = 0 # number of xacts by which cleanup is delayed + +# - Standby Servers - + +# These settings are ignored on a master server. 
+ +#hot_standby = off # "on" allows queries during recovery +# (change requires restart) +#max_standby_archive_delay = 30s # max delay before canceling queries +# when reading WAL from archive; +# -1 allows indefinite delay +#max_standby_streaming_delay = 30s # max delay before canceling queries +# when reading streaming WAL; +# -1 allows indefinite delay +#wal_receiver_status_interval = 10s # send replies at least this often +# 0 disables +#hot_standby_feedback = off # send info from standby to prevent +# query conflicts +#wal_receiver_timeout = 60s # time that receiver waits for +# communication from master +# in milliseconds; 0 disables +#wal_retrieve_retry_interval = 5s # time to wait before retrying to +# retrieve WAL after a failed attempt + + +#------------------------------------------------------------------------------ +# QUERY TUNING +#------------------------------------------------------------------------------ + +# - Planner Method Configuration - + +#enable_bitmapscan = on +#enable_hashagg = on +#enable_hashjoin = on +#enable_indexscan = on +#enable_indexonlyscan = on +#enable_material = on +#enable_mergejoin = on +#enable_nestloop = on +#enable_seqscan = on +#enable_sort = on +#enable_tidscan = on + +# - Planner Cost Constants - + +#seq_page_cost = 1.0 # measured on an arbitrary scale +#random_page_cost = 4.0 # same scale as above +#cpu_tuple_cost = 0.01 # same scale as above +#cpu_index_tuple_cost = 0.005 # same scale as above +#cpu_operator_cost = 0.0025 # same scale as above +#effective_cache_size = 4GB + +# - Genetic Query Optimizer - + +#geqo = on +#geqo_threshold = 12 +#geqo_effort = 5 # range 1-10 +#geqo_pool_size = 0 # selects default based on effort +#geqo_generations = 0 # selects default based on effort +#geqo_selection_bias = 2.0 # range 1.5-2.0 +#geqo_seed = 0.0 # range 0.0-1.0 + +# - Other Planner Options - + +#default_statistics_target = 100 # range 1-10000 +#constraint_exclusion = partition # on, off, or partition 
+#cursor_tuple_fraction = 0.1 # range 0.0-1.0 +#from_collapse_limit = 8 +#join_collapse_limit = 8 # 1 disables collapsing of explicit +# JOIN clauses + + +#------------------------------------------------------------------------------ +# ERROR REPORTING AND LOGGING +#------------------------------------------------------------------------------ + +# - Where to Log - + +log_destination = 'stderr' # Valid values are combinations of +# stderr, csvlog, syslog, and eventlog, +# depending on platform. csvlog +# requires logging_collector to be on. + +# This is used when logging to stderr: +logging_collector = on # Enable capturing of stderr and csvlog +# into log files. Required to be on for +# csvlogs. +# (change requires restart) + +# These are only used if logging_collector is on: +log_directory = 'pg_log' # directory where log files are written, +# can be absolute or relative to PGDATA +log_filename = 'postgresql-%a.log' # log file name pattern, +# can include strftime() escapes +#log_file_mode = 0600 # creation mode for log files, +# begin with 0 to use octal notation +log_truncate_on_rotation = on # If on, an existing log file with the +# same name as the new log file will be +# truncated rather than appended to. +# But such truncation only occurs on +# time-driven rotation, not on restarts +# or size-driven rotation. Default is +# off, meaning append to existing files +# in all cases. +log_rotation_age = 1d # Automatic rotation of logfiles will +# happen after that time. 0 disables. +log_rotation_size = 0 # Automatic rotation of logfiles will +# happen after that much log output. +# 0 disables. 
+ +# These are relevant when logging to syslog: +#syslog_facility = 'LOCAL0' +#syslog_ident = 'postgres' + +# This is only relevant when logging to eventlog (win32): +#event_source = 'PostgreSQL' + +# - When to Log - + +#client_min_messages = notice # values in order of decreasing detail: +# debug5 +# debug4 +# debug3 +# debug2 +# debug1 +# log +# notice +# warning +# error + +#log_min_messages = warning # values in order of decreasing detail: +# debug5 +# debug4 +# debug3 +# debug2 +# debug1 +# info +# notice +# warning +# error +# log +# fatal +# panic + +#log_min_error_statement = error # values in order of decreasing detail: +# debug5 +# debug4 +# debug3 +# debug2 +# debug1 +# info +# notice +# warning +# error +# log +# fatal +# panic (effectively off) + +#log_min_duration_statement = -1 # -1 is disabled, 0 logs all statements +# and their durations, > 0 logs only +# statements running at least this number +# of milliseconds + + +# - What to Log - + +#debug_print_parse = off +#debug_print_rewritten = off +#debug_print_plan = off +#debug_pretty_print = on +#log_checkpoints = off +#log_connections = off +#log_disconnections = off +#log_duration = off +#log_error_verbosity = default # terse, default, or verbose messages +#log_hostname = off +log_line_prefix = '< %m >' # special values: +# %a = application name +# %u = user name +# %d = database name +# %r = remote host and port +# %h = remote host +# %p = process ID +# %t = timestamp without milliseconds +# %m = timestamp with milliseconds +# %i = command tag +# %e = SQL state +# %c = session ID +# %l = session line number +# %s = session start timestamp +# %v = virtual transaction ID +# %x = transaction ID (0 if none) +# %q = stop here in non-session +# processes +# %% = '%' +# e.g. 
'<%u%%%d> ' +#log_lock_waits = off # log lock waits >= deadlock_timeout +#log_statement = 'none' # none, ddl, mod, all +#log_replication_commands = off +#log_temp_files = -1 # log temporary files equal or larger +# than the specified size in kilobytes; +# -1 disables, 0 logs all temp files +#log_timezone = 'GMT' + + +# - Process Title - + +#cluster_name = '' # added to process titles if nonempty +# (change requires restart) +#update_process_title = on + + +#------------------------------------------------------------------------------ +# RUNTIME STATISTICS +#------------------------------------------------------------------------------ + +# - Query/Index Statistics Collector - + +#track_activities = on +#track_counts = on +#track_io_timing = off +#track_functions = none # none, pl, all +#track_activity_query_size = 1024 # (change requires restart) +#stats_temp_directory = 'pg_stat_tmp' + + +# - Statistics Monitoring - + +#log_parser_stats = off +#log_planner_stats = off +#log_executor_stats = off +#log_statement_stats = off + + +#------------------------------------------------------------------------------ +# AUTOVACUUM PARAMETERS +#------------------------------------------------------------------------------ + +#autovacuum = on # Enable autovacuum subprocess? 'on' +# requires track_counts to also be on. +#log_autovacuum_min_duration = -1 # -1 disables, 0 logs all actions and +# their durations, > 0 logs only +# actions running at least this number +# of milliseconds. 
+#autovacuum_max_workers = 3 # max number of autovacuum subprocesses +# (change requires restart) +#autovacuum_naptime = 1min # time between autovacuum runs +#autovacuum_vacuum_threshold = 50 # min number of row updates before +# vacuum +#autovacuum_analyze_threshold = 50 # min number of row updates before +# analyze +#autovacuum_vacuum_scale_factor = 0.2 # fraction of table size before vacuum +#autovacuum_analyze_scale_factor = 0.1 # fraction of table size before analyze +#autovacuum_freeze_max_age = 200000000 # maximum XID age before forced vacuum +# (change requires restart) +#autovacuum_multixact_freeze_max_age = 400000000 # maximum multixact age +# before forced vacuum +# (change requires restart) +#autovacuum_vacuum_cost_delay = 20ms # default vacuum cost delay for +# autovacuum, in milliseconds; +# -1 means use vacuum_cost_delay +#autovacuum_vacuum_cost_limit = -1 # default vacuum cost limit for +# autovacuum, -1 means use +# vacuum_cost_limit + + +#------------------------------------------------------------------------------ +# CLIENT CONNECTION DEFAULTS +#------------------------------------------------------------------------------ + +# - Statement Behavior - + +#search_path = '"$user", public' # schema names +#default_tablespace = '' # a tablespace name, '' uses the default +#temp_tablespaces = '' # a list of tablespace names, '' uses +# only default tablespace +#check_function_bodies = on +#default_transaction_isolation = 'read committed' +#default_transaction_read_only = off +#default_transaction_deferrable = off +#session_replication_role = 'origin' +#statement_timeout = 0 # in milliseconds, 0 is disabled +#lock_timeout = 0 # in milliseconds, 0 is disabled +#vacuum_freeze_min_age = 50000000 +#vacuum_freeze_table_age = 150000000 +#vacuum_multixact_freeze_min_age = 5000000 +#vacuum_multixact_freeze_table_age = 150000000 +#bytea_output = 'hex' # hex, escape +#xmlbinary = 'base64' +#xmloption = 'content' +#gin_fuzzy_search_limit = 0 
+#gin_pending_list_limit = 4MB + +# - Locale and Formatting - + +#datestyle = 'iso, mdy' +#intervalstyle = 'postgres' +#timezone = 'GMT' +#timezone_abbreviations = 'Default' # Select the set of available time zone +# abbreviations. Currently, there are +# Default +# Australia (historical usage) +# India +# You can create your own file in +# share/timezonesets/. +#extra_float_digits = 0 # min -15, max 3 +#client_encoding = sql_ascii # actually, defaults to database +# encoding + +# These settings are initialized by initdb, but they can be changed. +#lc_messages = 'C' # locale for system error message +# strings +#lc_monetary = 'C' # locale for monetary formatting +#lc_numeric = 'C' # locale for number formatting +#lc_time = 'C' # locale for time formatting + +# default configuration for text search +#default_text_search_config = 'pg_catalog.simple' + +# - Other Defaults - + +#dynamic_library_path = '$libdir' +#local_preload_libraries = '' +#session_preload_libraries = '' + + +#------------------------------------------------------------------------------ +# LOCK MANAGEMENT +#------------------------------------------------------------------------------ + +#deadlock_timeout = 1s +#max_locks_per_transaction = 64 # min 10 +# (change requires restart) +# Note: Each lock table slot uses ~270 bytes of shared memory, and there are +# max_locks_per_transaction * (max_connections + max_prepared_transactions) +# lock table slots. 
+#max_pred_locks_per_transaction = 64 # min 10 +# (change requires restart) + + +#------------------------------------------------------------------------------ +# VERSION/PLATFORM COMPATIBILITY +#------------------------------------------------------------------------------ + +# - Previous PostgreSQL Versions - + +#array_nulls = on +#backslash_quote = safe_encoding # on, off, or safe_encoding +#default_with_oids = off +#escape_string_warning = on +#lo_compat_privileges = off +#operator_precedence_warning = off +#quote_all_identifiers = off +#sql_inheritance = on +#standard_conforming_strings = on +#synchronize_seqscans = on + +# - Other Platforms and Clients - + +#transform_null_equals = off + + +#------------------------------------------------------------------------------ +# ERROR HANDLING +#------------------------------------------------------------------------------ + +#exit_on_error = off # terminate session on any error? +#restart_after_crash = on # reinitialize after backend crash? + + +#------------------------------------------------------------------------------ +# CONFIG FILE INCLUDES +#------------------------------------------------------------------------------ + +# These options allow settings to be loaded from files other than the +# default postgresql.conf. 
+
+#include_dir = 'conf.d' # include files ending in '.conf' from
+# directory 'conf.d'
+#include_if_exists = 'exists.conf' # include file only if it exists
+#include = 'special.conf' # include file
+
+
+#------------------------------------------------------------------------------
+# CUSTOMIZED OPTIONS
+#------------------------------------------------------------------------------
+
+# Add settings for extensions here
diff --git a/tools/hive/hive_pg/scripts/bootstrap.sh b/tools/hive/hive_pg/scripts/bootstrap.sh
new file mode 100644
index 00000000..15c6451f
--- /dev/null
+++ b/tools/hive/hive_pg/scripts/bootstrap.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+#
+# Bootstrap for the Hive image: runs the Hadoop bootstrap, restarts
+# PostgreSQL, waits for HDFS, creates the HDFS directories used by Hive,
+# renders hive-site.xml from its templates and launches the Hive metastore
+# and HiveServer2 in the background.
+#
+# Usage: bootstrap.sh [-bash|-d]
+#   -bash  open an interactive shell once the services have been launched
+#   -d     sleep forever afterwards so the script never returns
+set -x
+
+# Save all env vars to .bashrc for ssh sessions. `export -p` writes quoted
+# `declare -x` statements, so values containing spaces survive being sourced
+# and stay exported to child processes; raw `printenv` output does neither.
+export -p >> /root/.bashrc
+
+# hadoop bootstrap
+/etc/hadoop-bootstrap.sh -d
+
+# restart postgresql
+sudo /etc/init.d/postgresql restart
+
+# kinit for kerberos mode. Only the presence of kinit matters here, so the
+# resolved path is discarded rather than printed.
+if command -v kinit >/dev/null 2>&1; then
+  kinit -k -t /usr/local/hadoop/etc/hadoop/hdfs.keytab hdfs@LOCAL
+fi
+
+# poll until HDFS answers a listing of /
+until hdfs dfs -ls /; do
+  echo "waiting for hdfs to be ready"
+  sleep 10
+done
+
+# create hdfs directories
+"$HADOOP_PREFIX/bin/hdfs" dfs -mkdir -p /user/root
+hdfs dfs -chown -R hdfs:supergroup /user
+
+"$HADOOP_PREFIX/bin/hdfs" dfs -mkdir -p /apps/hive/warehouse
+hdfs dfs -chown -R hive:supergroup /apps/hive
+hdfs dfs -chmod 777 /apps/hive/warehouse
+
+# altering the hive-site configuration: replace the HOSTNAME placeholder in
+# both template copies with this host's name
+sed "s/HOSTNAME/$HOSTNAME/" /usr/local/hive/conf/hive-site.xml.template > /usr/local/hive/conf/hive-site.xml
+sed "s/HOSTNAME/$HOSTNAME/" /opt/files/hive-site.xml.template > /opt/files/hive-site.xml
+
+# start hive metastore server
+"$HIVE_HOME/bin/hive" --service metastore &
+
+# NOTE(review): presumably gives the metastore time to come up before
+# hiveserver2 starts -- confirm, and consider a readiness check instead
+sleep 20
+
+# start hive server
+"$HIVE_HOME/bin/hive" --service hiveserver2 &
+
+# -bash: hand over to an interactive shell
+if [[ $1 == "-bash" ]]; then
+  /bin/bash
+fi
+
+# -d: block forever
+if [[ $1 == "-d" ]]; then
+  while true; do sleep 10000; done
+fi
diff --git a/tools/hive/hive_pg/templates/hive-site.xml.template b/tools/hive/hive_pg/templates/hive-site.xml.template
new file mode 100755
index 00000000..d22861b4 --- /dev/null +++ b/tools/hive/hive_pg/templates/hive-site.xml.template @@ -0,0 +1,154 @@ + + + hive.metastore.cache.pinobjtypes + Table,Database,Type,FieldSchema,Order + + + javax.jdo.option.ConnectionDriverName + org.postgresql.Driver + + + javax.jdo.option.ConnectionUserName + hive + + + hive.auto.convert.join + true + + + fs.hdfs.impl.disable.cache + true + + + fs.file.impl.disable.cache + true + + + hive.metastore.warehouse.dir + /apps/hive/warehouse + + + hive.auto.convert.sortmerge.join + true + + + hive.metastore.client.socket.timeout + 60 + + + hive.optimize.bucketmapjoin + true + + + hive.optimize.bucketmapjoin.sortedmerge + true + + + hive.optimize.index.filter + true + + + hive.auto.convert.join.noconditionaltask.size + 1000000000 + + + hive.auto.convert.join.noconditionaltask + true + + + hive.mapjoin.bucket.cache.size + 10000 + + + hive.vectorized.execution.enabled + true + + + hive.security.authorization.enabled + false + + + hive.optimize.reducededuplication.min.reducer + 4 + + + hive.server2.enable.doAs + true + + + hive.mapred.reduce.tasks.speculative.execution + false + + + javax.jdo.option.ConnectionURL + jdbc:postgresql://localhost/metastore + + + hive.enforce.bucketing + true + + + hive.metastore.execute.setugi + true + + + hive.enforce.sorting + true + + + hive.security.authorization.manager + org.apache.hadoop.hive.ql.security.authorization.DefaultHiveAuthorizationProvider + + + hive.map.aggr + true + + + hive.optimize.reducededuplication + true + + + + hive.vectorized.execution.enabled + true + + + hive.vectorized.groupby.maxentries + 10000 + + + hive.vectorized.groupby.checkinterval + 10000 + + + hive.input.format + org.apache.hadoop.hive.ql.io.HiveInputFormat + + + javax.jdo.option.ConnectionPassword + hive + + + tez.am.node-blacklisting.enabled + false + + + hive.prewarm.numcontainers + 3 + + Controls the number of containers to prewarm for tez (hadoop 2 only) + + + + mapred.tez.java.opts + -Xmx256m + + + 
hive.tez.container.size + 256 + + + diff --git a/tools/hive/kerberos/Dockerfile b/tools/hive/kerberos/Dockerfile new file mode 100644 index 00000000..fc2bebd4 --- /dev/null +++ b/tools/hive/kerberos/Dockerfile @@ -0,0 +1,20 @@ +FROM cdh5-hive + +# copy kerberized hadoop config files +ADD templates/core-site.xml.template $HADOOP_PREFIX/etc/hadoop/core-site.xml.template +ADD templates/hdfs-site.xml.template $HADOOP_PREFIX/etc/hadoop/hdfs-site.xml.template +ADD templates/yarn-site.xml.template $HADOOP_PREFIX/etc/hadoop/yarn-site.xml.template + +# copy kerberized hive config file +RUN echo $HIVE_CONF +ADD templates/hive-site.xml.template /opt/files/ +ADD templates/hive-site.xml.template $HIVE_CONF/hive-site.xml.template + +# krb5.conf +ADD conf/krb5.conf /etc/ + +# install kinit, used in bootstrap script +RUN apt-get install -y krb5-user + +# run bootstrap script +ENTRYPOINT ["/etc/hive-bootstrap.sh", "-d"] diff --git a/tools/hive/kerberos/conf/krb5.conf b/tools/hive/kerberos/conf/krb5.conf new file mode 100644 index 00000000..08416abb --- /dev/null +++ b/tools/hive/kerberos/conf/krb5.conf @@ -0,0 +1,10 @@ +[libdefaults] +default_realm = LOCAL +dns_lookup_realm = true +dns_lookup_kdc = true +udp_preference_limit = 1 + +[realms] +LOCAL = { + kdc = kdc.marathon.mesos:2500 +} diff --git a/tools/hive/kerberos/marathon/hdfs-hive-kerberos.json b/tools/hive/kerberos/marathon/hdfs-hive-kerberos.json new file mode 100644 index 00000000..d2963654 --- /dev/null +++ b/tools/hive/kerberos/marathon/hdfs-hive-kerberos.json @@ -0,0 +1,39 @@ +{ + "id": "/cdh5-hadoop-hive-kerberos", + "instances": 1, + "cpus": 1, + "mem": 4096, + "user": "root", + "container": { + "type": "MESOS", + "docker": { + "image": "susanxhuynh/cdh5-hive-kerberos:latest", + "forcePullImage": true + }, + "volumes": [ + { + "containerPath": "/usr/local/hadoop/etc/hadoop/hdfs.keytab", + "secret": "keytab", + "hostPath": "" + } + ] + }, + "secrets": { + "keytab": { + "source": "__dcos_base64___keytab" + } + }, + 
"networks": [ + { + "mode": "host" + } + ], + "requirePorts": false, + "constraints": [ + [ + "hostname", + "IS", + "10.0.1.100" + ] + ] +} diff --git a/tools/hive/kerberos/templates/core-site.xml.template b/tools/hive/kerberos/templates/core-site.xml.template new file mode 100644 index 00000000..a08c4d74 --- /dev/null +++ b/tools/hive/kerberos/templates/core-site.xml.template @@ -0,0 +1,28 @@ + + + fs.defaultFS + hdfs://HOSTNAME:9000 + + + + + hadoop.security.authentication + kerberos + + + + hadoop.security.authorization + true + + + + + hadoop.proxyuser.hive.hosts + * + + + + hadoop.proxyuser.hive.groups + * + + diff --git a/tools/hive/kerberos/templates/hdfs-site.xml.template b/tools/hive/kerberos/templates/hdfs-site.xml.template new file mode 100644 index 00000000..4fa0a560 --- /dev/null +++ b/tools/hive/kerberos/templates/hdfs-site.xml.template @@ -0,0 +1,58 @@ + + + dfs.replication + 1 + + + + + dfs.block.access.token.enable + true + + + + + dfs.namenode.keytab.file + /usr/local/hadoop/etc/hadoop/hdfs.keytab + + + dfs.namenode.kerberos.principal + hdfs/HOSTNAME@LOCAL + + + + + dfs.secondary.namenode.keytab.file + /usr/local/hadoop/etc/hadoop/hdfs.keytab + + + dfs.secondary.namenode.kerberos.principal + hdfs/HOSTNAME@LOCAL + + + + + dfs.datanode.keytab.file + /usr/local/hadoop/etc/hadoop/hdfs.keytab + + + dfs.datanode.kerberos.principal + hdfs/HOSTNAME@LOCAL + + + + + dfs.web.authentication.kerberos.principal + HTTP/HOSTNAME@LOCAL + + + + dfs.web.authentication.kerberos.keytab + /usr/local/hadoop/etc/hadoop/hdfs.keytab + + + + ignore.secure.ports.for.testing + true + + diff --git a/tools/hive/kerberos/templates/hive-site.xml.template b/tools/hive/kerberos/templates/hive-site.xml.template new file mode 100755 index 00000000..4ad073bc --- /dev/null +++ b/tools/hive/kerberos/templates/hive-site.xml.template @@ -0,0 +1,197 @@ + + + hive.metastore.cache.pinobjtypes + Table,Database,Type,FieldSchema,Order + + + javax.jdo.option.ConnectionDriverName + 
org.postgresql.Driver + + + javax.jdo.option.ConnectionUserName + hive + + + hive.auto.convert.join + true + + + fs.hdfs.impl.disable.cache + true + + + fs.file.impl.disable.cache + true + + + hive.metastore.warehouse.dir + /apps/hive/warehouse + + + hive.auto.convert.sortmerge.join + true + + + hive.metastore.client.socket.timeout + 60 + + + hive.optimize.bucketmapjoin + true + + + hive.optimize.bucketmapjoin.sortedmerge + true + + + hive.optimize.index.filter + true + + + hive.auto.convert.join.noconditionaltask.size + 1000000000 + + + hive.auto.convert.join.noconditionaltask + true + + + hive.mapjoin.bucket.cache.size + 10000 + + + hive.vectorized.execution.enabled + true + + + hive.security.authorization.enabled + false + + + hive.optimize.reducededuplication.min.reducer + 4 + + + hive.server2.enable.doAs + true + + + hive.mapred.reduce.tasks.speculative.execution + false + + + javax.jdo.option.ConnectionURL + jdbc:postgresql://localhost/metastore + + + hive.enforce.bucketing + true + + + hive.metastore.execute.setugi + true + + + hive.enforce.sorting + true + + + hive.map.aggr + true + + + hive.optimize.reducededuplication + true + + + + hive.vectorized.execution.enabled + true + + + hive.vectorized.groupby.maxentries + 10000 + + + hive.vectorized.groupby.checkinterval + 10000 + + + hive.input.format + org.apache.hadoop.hive.ql.io.HiveInputFormat + + + javax.jdo.option.ConnectionPassword + hive + + + tez.am.node-blacklisting.enabled + false + + + hive.prewarm.numcontainers + 3 + + Controls the number of containers to prewarm for tez (hadoop 2 only) + + + + mapred.tez.java.opts + -Xmx256m + + + hive.tez.container.size + 256 + + + + + + hive.server2.authentication + KERBEROS + + + + hive.server2.authentication.kerberos.principal + hive/HOSTNAME@LOCAL + + + + hive.server2.authentication.kerberos.keytab + /usr/local/hadoop/etc/hadoop/hdfs.keytab + + + + hive.metastore.sasl.enabled + true + + + + hive.metastore.kerberos.keytab.file + 
/usr/local/hadoop/etc/hadoop/hdfs.keytab + + + + hive.metastore.kerberos.principal + hive/HOSTNAME@LOCAL + + + + hive.security.authorization.createtable.owner.grants + ALL + The set of privileges automatically granted to the owner whenever a table gets created. + + + + hive.users.in.admin.role + hdfs,hive + + + + hive.security.authorization.manager + org.apache.hadoop.hive.ql.security.authorization.DefaultHiveAuthorizationProvider + + diff --git a/tools/hive/kerberos/templates/yarn-site.xml.template b/tools/hive/kerberos/templates/yarn-site.xml.template new file mode 100644 index 00000000..3ecbd8f6 --- /dev/null +++ b/tools/hive/kerberos/templates/yarn-site.xml.template @@ -0,0 +1,85 @@ + + + yarn.nodemanager.aux-services + mapreduce_shuffle + + + + yarn.application.classpath + /usr/local/hadoop/etc/hadoop, /usr/local/hadoop/share/hadoop/common/*, /usr/local/hadoop/share/hadoop/common/lib/*, /usr/local/hadoop/share/hadoop/hdfs/*, /usr/local/hadoop/share/hadoop/hdfs/lib/*, /usr/local/hadoop/share/hadoop/mapreduce/*, /usr/local/hadoop/share/hadoop/mapreduce/lib/*, /usr/local/hadoop/share/hadoop/yarn/*, /usr/local/hadoop/share/hadoop/yarn/lib/* + + + + + Number of seconds after an application finishes before the nodemanager's + DeletionService will delete the application's localized file directory + and log directory. + + To diagnose Yarn application problems, set this property's value large + enough (for example, to 600 = 10 minutes) to permit examination of these + directories. After changing the property's value, you must restart the + nodemanager in order for it to have an effect. + + The roots of Yarn applications' work directories is configurable with + the yarn.nodemanager.local-dirs property (see below), and the roots + of the Yarn applications' log directories is configurable with the + yarn.nodemanager.log-dirs property (see also below). + + yarn.nodemanager.delete.debug-delay-sec + 600 + + + + Indicate to clients whether Timeline service is enabled or not. 
+ If enabled, the TimelineClient library used by end-users will post entities + and events to the Timeline server. + yarn.timeline-service.enabled + true + + + + The hostname of the Timeline service web application. + yarn.timeline-service.hostname + 0.0.0.0 + + + + Enables cross-origin support (CORS) for web services where + cross-origin web response headers are needed. For example, javascript making + a web services request to the timeline server. + yarn.timeline-service.http-cross-origin.enabled + true + + + + yarn.resourcemanager.webapp.cross-origin.enabled + true + + + + Publish YARN information to Timeline Server + yarn.resourcemanager.system-metrics-publisher.enabled + true + + + + + yarn.resourcemanager.keytab + /usr/local/hadoop/etc/hadoop/hdfs.keytab + + + yarn.resourcemanager.principal + yarn/HOSTNAME@LOCAL + + + + + yarn.nodemanager.keytab + /usr/local/hadoop/etc/hadoop/hdfs.keytab + + + + yarn.nodemanager.principal + yarn/HOSTNAME@LOCAL + + diff --git a/tools/hive/ubuntu/Dockerfile b/tools/hive/ubuntu/Dockerfile new file mode 100644 index 00000000..031acca7 --- /dev/null +++ b/tools/hive/ubuntu/Dockerfile @@ -0,0 +1,60 @@ +FROM ubuntu:trusty + +USER root + +ENV JAVA_HOME /usr/lib/jvm/java-8-oracle +ENV HIVE_HOME /usr/local/hive +ENV HADOOP_HOME /usr/local/hadoop + +ENV PATH $PATH:$JAVA_HOME/bin:$HIVE_HOME/bin:$HADOOP_HOME:$HADOOP_HOME/bin + +# install dev tools +RUN apt-get update +RUN apt-get install -y curl wget tar openssh-server openssh-client rsync python-software-properties apt-file apache2 + +# for running sshd in ubuntu trusty. 
https://github.com/docker/docker/issues/5704 +RUN mkdir /var/run/sshd +RUN echo 'root:secretpasswd' | chpasswd +RUN sed -i 's/PermitRootLogin without-password/PermitRootLogin yes/' /etc/ssh/sshd_config +RUN echo "ServerName localhost" >> /etc/apache2/apache2.conf +RUN sed -i 's/Listen 80/Listen 9999/g' /etc/apache2/ports.conf + +# passwordless ssh +RUN yes | ssh-keygen -q -N "" -t dsa -f /etc/ssh/ssh_host_dsa_key +RUN yes | ssh-keygen -q -N "" -t rsa -f /etc/ssh/ssh_host_rsa_key +RUN yes | ssh-keygen -q -N "" -t rsa -f /root/.ssh/id_rsa +RUN cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys + +# fix the 254 error code +RUN sed -i "/^[^#]*UsePAM/ s/.*/#&/" /etc/ssh/sshd_config +RUN echo "UsePAM no" >> /etc/ssh/sshd_config +RUN echo "Port 2122" >> /etc/ssh/sshd_config +RUN /usr/sbin/sshd + +# ssh client config +ADD conf/ssh_config /root/.ssh/config +RUN chmod 600 /root/.ssh/config +RUN chown root:root /root/.ssh/config + +# oracle jdk 8 +RUN apt-get install -y software-properties-common +RUN add-apt-repository ppa:webupd8team/java +RUN apt-get update + +# to accept license agreement automatically +RUN echo debconf shared/accepted-oracle-license-v1-1 select true | debconf-set-selections +RUN echo debconf shared/accepted-oracle-license-v1-1 seen true | debconf-set-selections +RUN apt-get install -y oracle-java8-installer + +# java env setup +ENV JAVA_HOME /usr/lib/jvm/java-8-oracle +ENV PATH $PATH:$JAVA_HOME/bin + +# set permissions for bootstrap file +ADD scripts/bootstrap.sh /etc/ubuntu-bootstrap.sh +RUN chown root:root /etc/ubuntu-bootstrap.sh +RUN chmod 700 /etc/ubuntu-bootstrap.sh + +EXPOSE 22 + +ENTRYPOINT ["/etc/ubuntu-bootstrap.sh", "-bash"] diff --git a/tools/hive/ubuntu/base.env b/tools/hive/ubuntu/base.env new file mode 100644 index 00000000..bf704a09 --- /dev/null +++ b/tools/hive/ubuntu/base.env @@ -0,0 +1,19 @@ +JAVA_HOME=/usr/lib/jvm/java-8-oracle +HIVE_HOME=/usr/local/hive +HIVE_CONF=/usr/local/hive/conf +HIVE_VERSION=1.1.0 +HADOOP_VERSION=2.6.0 
+CDH_VERSION=5 +CDH_EXACT_VERSION=5.11.0 +POSTGRES_VERSION=9.5 +POSTGRESQL_MAIN=/var/lib/postgresql/9.5/main/ +POSTGRESQL_CONFIG_FILE=/var/lib/postgresql/9.5/main/postgresql.conf +POSTGRESQL_BIN=/usr/lib/postgresql/9.5/bin/postgres +PGPASSWORD=hive +HADOOP_HOME=/usr/local/hadoop +HADOOP_PREFIX=/usr/local/hadoop +HADOOP_COMMON_HOME=/usr/local/hadoop +HADOOP_HDFS_HOME=/usr/local/hadoop +HADOOP_MAPRED_HOME=/usr/local/hadoop +HADOOP_YARN_HOME=/usr/local/hadoop +HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop \ No newline at end of file diff --git a/tools/hive/ubuntu/conf/ssh_config b/tools/hive/ubuntu/conf/ssh_config new file mode 100644 index 00000000..535f9d32 --- /dev/null +++ b/tools/hive/ubuntu/conf/ssh_config @@ -0,0 +1,5 @@ +Host * + UserKnownHostsFile /dev/null + StrictHostKeyChecking no + LogLevel quiet + Port 2122 diff --git a/tools/hive/ubuntu/scripts/bootstrap.sh b/tools/hive/ubuntu/scripts/bootstrap.sh new file mode 100644 index 00000000..725b8079 --- /dev/null +++ b/tools/hive/ubuntu/scripts/bootstrap.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +/usr/sbin/sshd +service apache2 restart + +if [[ $1 == "-bash" ]]; then + /bin/bash +fi + +if [[ $1 == "-d" ]]; then + while true; do sleep 1000; done +fi From edb729f98410fcd009dc7b5ac54371922a80f69b Mon Sep 17 00:00:00 2001 From: "Susan X. Huynh" Date: Tue, 14 Aug 2018 16:00:21 -0700 Subject: [PATCH 02/15] use CMD instead of ENTRYPOINT --- tools/hive/README.md | 7 ++++++- tools/hive/hive_pg/Dockerfile | 2 +- tools/hive/kerberos/Dockerfile | 2 +- tools/hive/ubuntu/Dockerfile | 2 +- 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/tools/hive/README.md b/tools/hive/README.md index 88d37861..e98905f4 100644 --- a/tools/hive/README.md +++ b/tools/hive/README.md @@ -39,7 +39,12 @@ cd ../kerberos docker build -t cdh5-hive-kerberos . 
``` -## Run the Kerberos image in DC/OS +## Run the Hive image interactively +``` +docker run -it cdh5-hive:latest /etc/hive-bootstrap.sh -bash +``` + +## Run the Kerberized Hive image in DC/OS First, deploy a KDC via the dcos-commons kdc.py utility. See [the kdc README](https://github.com/mesosphere/dcos-commons/tree/master/tools/kdc) for details. From the dcos-commons repo: diff --git a/tools/hive/hive_pg/Dockerfile b/tools/hive/hive_pg/Dockerfile index dc506780..1911d77d 100644 --- a/tools/hive/hive_pg/Dockerfile +++ b/tools/hive/hive_pg/Dockerfile @@ -75,4 +75,4 @@ RUN chmod 700 /etc/hive-bootstrap.sh EXPOSE 10000 10001 10002 10003 9083 50111 5432 # run bootstrap script -ENTRYPOINT ["/etc/hive-bootstrap.sh", "-d"] +CMD ["/etc/hive-bootstrap.sh", "-d"] diff --git a/tools/hive/kerberos/Dockerfile b/tools/hive/kerberos/Dockerfile index fc2bebd4..87f40c2c 100644 --- a/tools/hive/kerberos/Dockerfile +++ b/tools/hive/kerberos/Dockerfile @@ -17,4 +17,4 @@ ADD conf/krb5.conf /etc/ RUN apt-get install -y krb5-user # run bootstrap script -ENTRYPOINT ["/etc/hive-bootstrap.sh", "-d"] +CMD ["/etc/hive-bootstrap.sh", "-d"] diff --git a/tools/hive/ubuntu/Dockerfile b/tools/hive/ubuntu/Dockerfile index 031acca7..5e72c21e 100644 --- a/tools/hive/ubuntu/Dockerfile +++ b/tools/hive/ubuntu/Dockerfile @@ -57,4 +57,4 @@ RUN chmod 700 /etc/ubuntu-bootstrap.sh EXPOSE 22 -ENTRYPOINT ["/etc/ubuntu-bootstrap.sh", "-bash"] +CMD ["/etc/ubuntu-bootstrap.sh", "-bash"] From fdb6c451133ac8c5a5254d4fc307e9457da7710e Mon Sep 17 00:00:00 2001 From: "Susan X. Huynh" Date: Thu, 16 Aug 2018 10:41:30 -0700 Subject: [PATCH 03/15] Removed the apache2 configuration, but left the package because it seems to be a prerequisite for installing the software-properties-common package. Removed ubuntu bootstrap script. 
--- tools/hive/kerberos/Dockerfile | 2 +- tools/hive/ubuntu/Dockerfile | 9 --------- tools/hive/ubuntu/scripts/bootstrap.sh | 12 ------------ 3 files changed, 1 insertion(+), 22 deletions(-) delete mode 100644 tools/hive/ubuntu/scripts/bootstrap.sh diff --git a/tools/hive/kerberos/Dockerfile b/tools/hive/kerberos/Dockerfile index 87f40c2c..a79b30cd 100644 --- a/tools/hive/kerberos/Dockerfile +++ b/tools/hive/kerberos/Dockerfile @@ -16,5 +16,5 @@ ADD conf/krb5.conf /etc/ # install kinit, used in bootstrap script RUN apt-get install -y krb5-user -# run bootstrap script +# run bootstrap script which starts hadoop and hive servers CMD ["/etc/hive-bootstrap.sh", "-d"] diff --git a/tools/hive/ubuntu/Dockerfile b/tools/hive/ubuntu/Dockerfile index 5e72c21e..8a3ebf36 100644 --- a/tools/hive/ubuntu/Dockerfile +++ b/tools/hive/ubuntu/Dockerfile @@ -16,8 +16,6 @@ RUN apt-get install -y curl wget tar openssh-server openssh-client rsync python- RUN mkdir /var/run/sshd RUN echo 'root:secretpasswd' | chpasswd RUN sed -i 's/PermitRootLogin without-password/PermitRootLogin yes/' /etc/ssh/sshd_config -RUN echo "ServerName localhost" >> /etc/apache2/apache2.conf -RUN sed -i 's/Listen 80/Listen 9999/g' /etc/apache2/ports.conf # passwordless ssh RUN yes | ssh-keygen -q -N "" -t dsa -f /etc/ssh/ssh_host_dsa_key @@ -50,11 +48,4 @@ RUN apt-get install -y oracle-java8-installer ENV JAVA_HOME /usr/lib/jvm/java-8-oracle ENV PATH $PATH:$JAVA_HOME/bin -# set permissions for bootstrap file -ADD scripts/bootstrap.sh /etc/ubuntu-bootstrap.sh -RUN chown root:root /etc/ubuntu-bootstrap.sh -RUN chmod 700 /etc/ubuntu-bootstrap.sh - EXPOSE 22 - -CMD ["/etc/ubuntu-bootstrap.sh", "-bash"] diff --git a/tools/hive/ubuntu/scripts/bootstrap.sh b/tools/hive/ubuntu/scripts/bootstrap.sh deleted file mode 100644 index 725b8079..00000000 --- a/tools/hive/ubuntu/scripts/bootstrap.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -/usr/sbin/sshd -service apache2 restart - -if [[ $1 == "-bash" ]]; then - /bin/bash 
-fi - -if [[ $1 == "-d" ]]; then - while true; do sleep 1000; done -fi From 85cd11b31bcffe0c70bffdf71e3235cfe146dada Mon Sep 17 00:00:00 2001 From: "Susan X. Huynh" Date: Thu, 16 Aug 2018 11:58:44 -0700 Subject: [PATCH 04/15] Switched to ubuntu 16.04 --- tools/hive/ubuntu/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/hive/ubuntu/Dockerfile b/tools/hive/ubuntu/Dockerfile index 8a3ebf36..86ccc446 100644 --- a/tools/hive/ubuntu/Dockerfile +++ b/tools/hive/ubuntu/Dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:trusty +FROM ubuntu:16.04 USER root From 2585dffd7d8a74454051e5e5c683de34620a9226 Mon Sep 17 00:00:00 2001 From: "Susan X. Huynh" Date: Thu, 16 Aug 2018 12:00:28 -0700 Subject: [PATCH 05/15] first pass combining ubuntu and hadoop images --- tools/hive/single-image/Dockerfile | 98 +++++++++++++++++++ tools/hive/single-image/conf/mapred-site.xml | 6 ++ tools/hive/single-image/conf/ssh_config | 5 + tools/hive/single-image/scripts/bootstrap.sh | 24 +++++ .../templates/core-site.xml.template | 6 ++ .../templates/hdfs-site.xml.template | 6 ++ .../templates/yarn-site.xml.template | 64 ++++++++++++ 7 files changed, 209 insertions(+) create mode 100644 tools/hive/single-image/Dockerfile create mode 100644 tools/hive/single-image/conf/mapred-site.xml create mode 100644 tools/hive/single-image/conf/ssh_config create mode 100755 tools/hive/single-image/scripts/bootstrap.sh create mode 100644 tools/hive/single-image/templates/core-site.xml.template create mode 100644 tools/hive/single-image/templates/hdfs-site.xml.template create mode 100644 tools/hive/single-image/templates/yarn-site.xml.template diff --git a/tools/hive/single-image/Dockerfile b/tools/hive/single-image/Dockerfile new file mode 100644 index 00000000..7d07f97d --- /dev/null +++ b/tools/hive/single-image/Dockerfile @@ -0,0 +1,98 @@ +FROM ubuntu:16.04 + +USER root + +ENV JAVA_HOME /usr/lib/jvm/java-8-oracle +ENV HADOOP_VERSION 2.6.0 +ENV CDH_VERSION 5 +ENV CDH_EXACT_VERSION 5.11.0 
+ENV HADOOP_HOME /usr/local/hadoop +ENV HADOOP_PREFIX /usr/local/hadoop +ENV HADOOP_COMMON_HOME /usr/local/hadoop +ENV HADOOP_HDFS_HOME /usr/local/hadoop +ENV HADOOP_MAPRED_HOME /usr/local/hadoop +ENV HADOOP_YARN_HOME /usr/local/hadoop +ENV HADOOP_CONF_DIR /usr/local/hadoop/etc/hadoop +ENV HIVE_HOME /usr/local/hive + +# install dev tools +RUN apt-get update +RUN apt-get install -y curl wget tar openssh-server openssh-client rsync python-software-properties apt-file apache2 + +# for running sshd in ubuntu trusty. https://github.com/docker/docker/issues/5704 +RUN mkdir /var/run/sshd +RUN echo 'root:secretpasswd' | chpasswd +RUN sed -i 's/PermitRootLogin without-password/PermitRootLogin yes/' /etc/ssh/sshd_config + +# passwordless ssh +RUN yes | ssh-keygen -q -N "" -t dsa -f /etc/ssh/ssh_host_dsa_key +RUN yes | ssh-keygen -q -N "" -t rsa -f /etc/ssh/ssh_host_rsa_key +RUN yes | ssh-keygen -q -N "" -t rsa -f /root/.ssh/id_rsa +RUN cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys + +# fix the 254 error code +RUN sed -i "/^[^#]*UsePAM/ s/.*/#&/" /etc/ssh/sshd_config +RUN echo "UsePAM no" >> /etc/ssh/sshd_config +RUN echo "Port 2122" >> /etc/ssh/sshd_config +RUN /usr/sbin/sshd + +# ssh client config +ADD conf/ssh_config /root/.ssh/config +RUN chmod 600 /root/.ssh/config +RUN chown root:root /root/.ssh/config + +EXPOSE 22 + +# oracle jdk 8 +RUN apt-get install -y software-properties-common +RUN add-apt-repository ppa:webupd8team/java +RUN apt-get update + +# to accept license agreement automatically +RUN echo debconf shared/accepted-oracle-license-v1-1 select true | debconf-set-selections +RUN echo debconf shared/accepted-oracle-license-v1-1 seen true | debconf-set-selections +RUN apt-get install -y oracle-java8-installer + +# java env setup +ENV JAVA_HOME /usr/lib/jvm/java-8-oracle +ENV PATH $PATH:$JAVA_HOME/bin + +ADD ./deps/hadoop-${HADOOP_VERSION}-cdh${CDH_EXACT_VERSION}.tar.gz /usr/local +RUN cd /usr/local && ln -s ./hadoop-${HADOOP_VERSION}-cdh${CDH_EXACT_VERSION} 
hadoop + +RUN sed -i '/^export JAVA_HOME/ s:.*:export JAVA_HOME=/usr/lib/jvm/java-8-oracle\nexport HADOOP_PREFIX=/usr/local/hadoop\nexport HADOOP_HOME=/usr/local/hadoop\n:' $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh +RUN sed -i '/^export HADOOP_CONF_DIR/ s:.*:export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop/:' $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh + +# copy hadoop site xml files +RUN mkdir $HADOOP_PREFIX/input +RUN cp $HADOOP_PREFIX/etc/hadoop/*.xml $HADOOP_PREFIX/input + +# pseudo distributed configurations +ADD templates/core-site.xml.template $HADOOP_PREFIX/etc/hadoop/core-site.xml.template +ADD templates/hdfs-site.xml.template $HADOOP_PREFIX/etc/hadoop/hdfs-site.xml.template +ADD conf/mapred-site.xml $HADOOP_PREFIX/etc/hadoop/mapred-site.xml +ADD templates/yarn-site.xml.template $HADOOP_PREFIX/etc/hadoop/yarn-site.xml.template + +# format namenode +RUN $HADOOP_PREFIX/bin/hdfs namenode -format + +# fixing the libhadoop.so +RUN rm -rf /usr/local/hadoop/lib/native/* +ADD ./deps/hadoop-native-64-2.6.0.tar /usr/local/hadoop/lib/native/ + +# add and set permissions for bootstrap script +ADD scripts/bootstrap.sh /etc/hadoop-bootstrap.sh +RUN chown root:root /etc/hadoop-bootstrap.sh +RUN chmod 700 /etc/hadoop-bootstrap.sh + +RUN chmod +x /usr/local/hadoop/etc/hadoop/*-env.sh + +# add hadoop to path +ENV PATH $PATH:$HADOOP_HOME:$HADOOP_HOME/bin + +#for exposed ports refer +#https://www.cloudera.com/documentation/enterprise/5-4-x/topics/cdh_ig_ports_cdh5.html +EXPOSE 50010 50020 50070 50075 50090 8020 9000 10020 19888 8030 8031 8032 8033 8040 8042 8088 + +# add hive to path +ENV PATH $PATH:$HIVE_HOME/bin diff --git a/tools/hive/single-image/conf/mapred-site.xml b/tools/hive/single-image/conf/mapred-site.xml new file mode 100644 index 00000000..dba582f1 --- /dev/null +++ b/tools/hive/single-image/conf/mapred-site.xml @@ -0,0 +1,6 @@ + + + mapreduce.framework.name + yarn + + diff --git a/tools/hive/single-image/conf/ssh_config b/tools/hive/single-image/conf/ssh_config 
new file mode 100644 index 00000000..535f9d32 --- /dev/null +++ b/tools/hive/single-image/conf/ssh_config @@ -0,0 +1,5 @@ +Host * + UserKnownHostsFile /dev/null + StrictHostKeyChecking no + LogLevel quiet + Port 2122 diff --git a/tools/hive/single-image/scripts/bootstrap.sh b/tools/hive/single-image/scripts/bootstrap.sh new file mode 100755 index 00000000..7df86ffe --- /dev/null +++ b/tools/hive/single-image/scripts/bootstrap.sh @@ -0,0 +1,24 @@ +#!/bin/bash +set -x +/usr/sbin/sshd +: ${HADOOP_PREFIX:=/usr/local/hadoop} + +$HADOOP_PREFIX/etc/hadoop/hadoop-env.sh + +rm /tmp/*.pid + +# installing libraries if any - (resource urls added comma separated to the ACP system variable) +cd $HADOOP_PREFIX/share/hadoop/common ; for cp in ${ACP//,/ }; do echo == $cp; curl -LO $cp ; done; cd - + +# templating of config files +sed s/HOSTNAME/$HOSTNAME/ /usr/local/hadoop/etc/hadoop/core-site.xml.template > /usr/local/hadoop/etc/hadoop/core-site.xml +sed s/HOSTNAME/$HOSTNAME/ /usr/local/hadoop/etc/hadoop/yarn-site.xml.template > /usr/local/hadoop/etc/hadoop/yarn-site.xml +sed s/HOSTNAME/$HOSTNAME/ /usr/local/hadoop/etc/hadoop/hdfs-site.xml.template > /usr/local/hadoop/etc/hadoop/hdfs-site.xml + + +$HADOOP_PREFIX/sbin/start-dfs.sh +$HADOOP_PREFIX/sbin/start-yarn.sh + +if [[ $1 == "-bash" ]]; then + /bin/bash +fi diff --git a/tools/hive/single-image/templates/core-site.xml.template b/tools/hive/single-image/templates/core-site.xml.template new file mode 100644 index 00000000..3576bbd5 --- /dev/null +++ b/tools/hive/single-image/templates/core-site.xml.template @@ -0,0 +1,6 @@ + + + fs.defaultFS + hdfs://HOSTNAME:9000 + + diff --git a/tools/hive/single-image/templates/hdfs-site.xml.template b/tools/hive/single-image/templates/hdfs-site.xml.template new file mode 100644 index 00000000..82c525ea --- /dev/null +++ b/tools/hive/single-image/templates/hdfs-site.xml.template @@ -0,0 +1,6 @@ + + + dfs.replication + 1 + + diff --git a/tools/hive/single-image/templates/yarn-site.xml.template 
b/tools/hive/single-image/templates/yarn-site.xml.template new file mode 100644 index 00000000..37c1850a --- /dev/null +++ b/tools/hive/single-image/templates/yarn-site.xml.template @@ -0,0 +1,64 @@ + + + yarn.nodemanager.aux-services + mapreduce_shuffle + + + + yarn.application.classpath + /usr/local/hadoop/etc/hadoop, /usr/local/hadoop/share/hadoop/common/*, /usr/local/hadoop/share/hadoop/common/lib/*, /usr/local/hadoop/share/hadoop/hdfs/*, /usr/local/hadoop/share/hadoop/hdfs/lib/*, /usr/local/hadoop/share/hadoop/mapreduce/*, /usr/local/hadoop/share/hadoop/mapreduce/lib/*, /usr/local/hadoop/share/hadoop/yarn/*, /usr/local/hadoop/share/hadoop/yarn/lib/* + + + + + Number of seconds after an application finishes before the nodemanager's + DeletionService will delete the application's localized file directory + and log directory. + + To diagnose Yarn application problems, set this property's value large + enough (for example, to 600 = 10 minutes) to permit examination of these + directories. After changing the property's value, you must restart the + nodemanager in order for it to have an effect. + + The roots of Yarn applications' work directories is configurable with + the yarn.nodemanager.local-dirs property (see below), and the roots + of the Yarn applications' log directories is configurable with the + yarn.nodemanager.log-dirs property (see also below). + + yarn.nodemanager.delete.debug-delay-sec + 600 + + + + Indicate to clients whether Timeline service is enabled or not. + If enabled, the TimelineClient library used by end-users will post entities + and events to the Timeline server. + yarn.timeline-service.enabled + true + + + + The hostname of the Timeline service web application. + yarn.timeline-service.hostname + 0.0.0.0 + + + + Enables cross-origin support (CORS) for web services where + cross-origin web response headers are needed. For example, javascript making + a web services request to the timeline server. 
+ yarn.timeline-service.http-cross-origin.enabled + true + + + + yarn.resourcemanager.webapp.cross-origin.enabled + true + + + + Publish YARN information to Timeline Server + yarn.resourcemanager.system-metrics-publisher.enabled + true + + From 81719af872a6f577bf104177ae33e8264b4ad77d Mon Sep 17 00:00:00 2001 From: "Susan X. Huynh" Date: Thu, 16 Aug 2018 14:25:08 -0700 Subject: [PATCH 06/15] removed unused code in Hadoop Dockerfile / setup --- tools/hive/single-image/Dockerfile | 15 ++++++--------- tools/hive/single-image/scripts/bootstrap.sh | 9 ++++++++- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/tools/hive/single-image/Dockerfile b/tools/hive/single-image/Dockerfile index 7d07f97d..99e8e60b 100644 --- a/tools/hive/single-image/Dockerfile +++ b/tools/hive/single-image/Dockerfile @@ -60,12 +60,12 @@ ENV PATH $PATH:$JAVA_HOME/bin ADD ./deps/hadoop-${HADOOP_VERSION}-cdh${CDH_EXACT_VERSION}.tar.gz /usr/local RUN cd /usr/local && ln -s ./hadoop-${HADOOP_VERSION}-cdh${CDH_EXACT_VERSION} hadoop -RUN sed -i '/^export JAVA_HOME/ s:.*:export JAVA_HOME=/usr/lib/jvm/java-8-oracle\nexport HADOOP_PREFIX=/usr/local/hadoop\nexport HADOOP_HOME=/usr/local/hadoop\n:' $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh -RUN sed -i '/^export HADOOP_CONF_DIR/ s:.*:export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop/:' $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh +RUN sed -i '/^export JAVA_HOME/ s:.*:export JAVA_HOME=/usr/lib/jvm/java-8-oracle\n:' $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh +#RUN sed -i '/^export HADOOP_CONF_DIR/ s:.*:export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop/:' $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh # copy hadoop site xml files -RUN mkdir $HADOOP_PREFIX/input -RUN cp $HADOOP_PREFIX/etc/hadoop/*.xml $HADOOP_PREFIX/input +#RUN mkdir $HADOOP_PREFIX/input +#RUN cp $HADOOP_PREFIX/etc/hadoop/*.xml $HADOOP_PREFIX/input # pseudo distributed configurations ADD templates/core-site.xml.template $HADOOP_PREFIX/etc/hadoop/core-site.xml.template @@ -73,12 +73,9 @@ ADD 
templates/hdfs-site.xml.template $HADOOP_PREFIX/etc/hadoop/hdfs-site.xml.tem ADD conf/mapred-site.xml $HADOOP_PREFIX/etc/hadoop/mapred-site.xml ADD templates/yarn-site.xml.template $HADOOP_PREFIX/etc/hadoop/yarn-site.xml.template -# format namenode -RUN $HADOOP_PREFIX/bin/hdfs namenode -format - # fixing the libhadoop.so -RUN rm -rf /usr/local/hadoop/lib/native/* -ADD ./deps/hadoop-native-64-2.6.0.tar /usr/local/hadoop/lib/native/ +#RUN rm -rf /usr/local/hadoop/lib/native/* +#ADD ./deps/hadoop-native-64-2.6.0.tar /usr/local/hadoop/lib/native/ # add and set permissions for bootstrap script ADD scripts/bootstrap.sh /etc/hadoop-bootstrap.sh diff --git a/tools/hive/single-image/scripts/bootstrap.sh b/tools/hive/single-image/scripts/bootstrap.sh index 7df86ffe..6b28b83a 100755 --- a/tools/hive/single-image/scripts/bootstrap.sh +++ b/tools/hive/single-image/scripts/bootstrap.sh @@ -1,6 +1,9 @@ #!/bin/bash set -x + +# used by Hadoop /usr/sbin/sshd + : ${HADOOP_PREFIX:=/usr/local/hadoop} $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh @@ -8,14 +11,18 @@ $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh rm /tmp/*.pid # installing libraries if any - (resource urls added comma separated to the ACP system variable) -cd $HADOOP_PREFIX/share/hadoop/common ; for cp in ${ACP//,/ }; do echo == $cp; curl -LO $cp ; done; cd - +#cd $HADOOP_PREFIX/share/hadoop/common ; for cp in ${ACP//,/ }; do echo == $cp; curl -LO $cp ; done; cd - # templating of config files sed s/HOSTNAME/$HOSTNAME/ /usr/local/hadoop/etc/hadoop/core-site.xml.template > /usr/local/hadoop/etc/hadoop/core-site.xml sed s/HOSTNAME/$HOSTNAME/ /usr/local/hadoop/etc/hadoop/yarn-site.xml.template > /usr/local/hadoop/etc/hadoop/yarn-site.xml sed s/HOSTNAME/$HOSTNAME/ /usr/local/hadoop/etc/hadoop/hdfs-site.xml.template > /usr/local/hadoop/etc/hadoop/hdfs-site.xml +# format namenode +$HADOOP_PREFIX/bin/hdfs namenode -format +# start hdfs and yarn +echo $JAVA_HOME $HADOOP_PREFIX/sbin/start-dfs.sh $HADOOP_PREFIX/sbin/start-yarn.sh From 
d2839f90c91e1ca6a98c5c531626d65394fd855e Mon Sep 17 00:00:00 2001 From: "Susan X. Huynh" Date: Thu, 16 Aug 2018 17:04:53 -0700 Subject: [PATCH 07/15] combined hive into the single image --- tools/hive/download_deps.sh | 21 - tools/hive/single-image/Dockerfile | 71 +- .../single-image/conf/hive-log4j.properties | 88 +++ tools/hive/single-image/conf/log4j.properties | 6 + tools/hive/single-image/conf/postgresql.conf | 630 ++++++++++++++++++ .../{bootstrap.sh => hadoop-bootstrap.sh} | 0 .../single-image/scripts/hive-bootstrap.sh | 49 ++ .../templates/hive-site.xml.template | 154 +++++ 8 files changed, 995 insertions(+), 24 deletions(-) delete mode 100755 tools/hive/download_deps.sh create mode 100644 tools/hive/single-image/conf/hive-log4j.properties create mode 100755 tools/hive/single-image/conf/log4j.properties create mode 100644 tools/hive/single-image/conf/postgresql.conf rename tools/hive/single-image/scripts/{bootstrap.sh => hadoop-bootstrap.sh} (100%) create mode 100644 tools/hive/single-image/scripts/hive-bootstrap.sh create mode 100755 tools/hive/single-image/templates/hive-site.xml.template diff --git a/tools/hive/download_deps.sh b/tools/hive/download_deps.sh deleted file mode 100755 index 7a8e2c0d..00000000 --- a/tools/hive/download_deps.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env bash -set -x -set -e -source ubuntu/base.env - -mkdir hadoop-2.6.0/deps -mkdir hive_pg/deps - -#download cdh -echo "wget http://archive.cloudera.com/cdh${CDH_VERSION}/cdh/${CDH_VERSION}/hadoop-${HADOOP_VERSION}-cdh${CDH_EXACT_VERSION}.tar.gz | tar -xz -C /usr/local/" -wget http://archive.cloudera.com/cdh${CDH_VERSION}/cdh/${CDH_VERSION}/hadoop-${HADOOP_VERSION}-cdh${CDH_EXACT_VERSION}.tar.gz -mv hadoop-${HADOOP_VERSION}-cdh${CDH_EXACT_VERSION}.tar.gz hadoop-2.6.0/deps/ - -#download native hadoop libs -echo "wget http://dl.bintray.com/sequenceiq/sequenceiq-bin/hadoop-native-64-2.6.0.tar" -wget http://dl.bintray.com/sequenceiq/sequenceiq-bin/hadoop-native-64-2.6.0.tar -mv 
hadoop-native-64-2.6.0.tar hadoop-2.6.0/deps/ - -echo "wget http://archive.cloudera.com/cdh${CDH_VERSION}/cdh/${CDH_VERSION}/hive-1.1.0-cdh${CDH_EXACT_VERSION}.tar.gz" -wget http://archive.cloudera.com/cdh${CDH_VERSION}/cdh/${CDH_VERSION}/hive-1.1.0-cdh${CDH_EXACT_VERSION}.tar.gz -mv hive-1.1.0-cdh${CDH_EXACT_VERSION}.tar.gz hive_pg/deps/ diff --git a/tools/hive/single-image/Dockerfile b/tools/hive/single-image/Dockerfile index 99e8e60b..9b354ac5 100644 --- a/tools/hive/single-image/Dockerfile +++ b/tools/hive/single-image/Dockerfile @@ -14,6 +14,13 @@ ENV HADOOP_MAPRED_HOME /usr/local/hadoop ENV HADOOP_YARN_HOME /usr/local/hadoop ENV HADOOP_CONF_DIR /usr/local/hadoop/etc/hadoop ENV HIVE_HOME /usr/local/hive +ENV HIVE_CONF /usr/local/hive/conf +ENV HIVE_VERSION 1.1.0 +ENV POSTGRES_VERSION 9.5 +ENV POSTGRESQL_MAIN /var/lib/postgresql/9.5/main/ +ENV POSTGRESQL_CONFIG_FILE /var/lib/postgresql/9.5/main/postgresql.conf +ENV POSTGRESQL_BIN /usr/lib/postgresql/9.5/bin/postgres +ENV PGPASSWORD hive # install dev tools RUN apt-get update @@ -57,8 +64,11 @@ RUN apt-get install -y oracle-java8-installer ENV JAVA_HOME /usr/lib/jvm/java-8-oracle ENV PATH $PATH:$JAVA_HOME/bin -ADD ./deps/hadoop-${HADOOP_VERSION}-cdh${CDH_EXACT_VERSION}.tar.gz /usr/local -RUN cd /usr/local && ln -s ./hadoop-${HADOOP_VERSION}-cdh${CDH_EXACT_VERSION} hadoop +# download cdh hadoop +RUN curl -L http://archive.cloudera.com/cdh${CDH_VERSION}/cdh/${CDH_VERSION}/hadoop-${HADOOP_VERSION}-cdh${CDH_EXACT_VERSION}.tar.gz \ + | tar -xzC /usr/local && \ + cd /usr/local && \ + ln -s ./hadoop-${HADOOP_VERSION}-cdh${CDH_EXACT_VERSION} hadoop RUN sed -i '/^export JAVA_HOME/ s:.*:export JAVA_HOME=/usr/lib/jvm/java-8-oracle\n:' $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh #RUN sed -i '/^export HADOOP_CONF_DIR/ s:.*:export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop/:' $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh @@ -78,7 +88,7 @@ ADD templates/yarn-site.xml.template $HADOOP_PREFIX/etc/hadoop/yarn-site.xml.tem #ADD 
./deps/hadoop-native-64-2.6.0.tar /usr/local/hadoop/lib/native/ # add and set permissions for bootstrap script -ADD scripts/bootstrap.sh /etc/hadoop-bootstrap.sh +ADD scripts/hadoop-bootstrap.sh /etc/hadoop-bootstrap.sh RUN chown root:root /etc/hadoop-bootstrap.sh RUN chmod 700 /etc/hadoop-bootstrap.sh @@ -91,5 +101,60 @@ ENV PATH $PATH:$HADOOP_HOME:$HADOOP_HOME/bin #https://www.cloudera.com/documentation/enterprise/5-4-x/topics/cdh_ig_ports_cdh5.html EXPOSE 50010 50020 50070 50075 50090 8020 9000 10020 19888 8030 8031 8032 8033 8040 8042 8088 +# download cdh hive +RUN curl -L http://archive.cloudera.com/cdh${CDH_VERSION}/cdh/${CDH_VERSION}/hive-1.1.0-cdh${CDH_EXACT_VERSION}.tar.gz \ + | tar -xzC /usr/local && \ + cd /usr/local && \ + mv hive-1.1.0-cdh${CDH_EXACT_VERSION} hive + # add hive to path ENV PATH $PATH:$HIVE_HOME/bin + +# add postgresql jdbc jar to classpath +RUN ln -s /usr/share/java/postgresql-jdbc4.jar $HIVE_HOME/lib/postgresql-jdbc4.jar + +# to configure postgres as hive metastore backend +RUN sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt/ `lsb_release -cs`-pgdg main" >> /etc/apt/sources.list.d/pgdg.list' +RUN wget -q https://www.postgresql.org/media/keys/ACCC4CF8.asc -O - | apt-key add - +RUN apt-get update -y +RUN apt-get -yq install vim postgresql-9.5 libpostgresql-jdbc-java + +USER postgres +# initialize hive metastore db +# create metastore db, hive user and assign privileges +RUN cd $HIVE_HOME/scripts/metastore/upgrade/postgres/ &&\ + /etc/init.d/postgresql start &&\ + psql --command "CREATE DATABASE metastore;" &&\ + psql --command "CREATE USER hive WITH PASSWORD 'hive';" && \ + psql --command "ALTER USER hive WITH SUPERUSER;" && \ + psql --command "GRANT ALL PRIVILEGES ON DATABASE metastore TO hive;" && \ + psql -U hive -d metastore -h localhost -f hive-schema-${HIVE_VERSION}.postgres.sql + + +# revert back to default user +USER root + +# disable ssl in postgres.conf +ADD conf/postgresql.conf $POSTGRESQL_MAIN +RUN echo 
$POSTGRESQL_MAIN +RUN echo $POSTGRESQL_CONFIG_FILE +RUN chown postgres:postgres $POSTGRESQL_CONFIG_FILE +RUN sed -i -e 's/peer/md5/g' /etc/postgresql/$POSTGRES_VERSION/main/pg_hba.conf + +# copy config, sql, data files to /opt/files +RUN mkdir /opt/files +RUN echo $HIVE_CONF +ADD templates/hive-site.xml.template /opt/files/ +ADD conf/hive-log4j.properties /opt/files/ +ADD templates/hive-site.xml.template $HIVE_CONF/hive-site.xml.template +ADD conf/hive-log4j.properties $HIVE_CONF/hive-log4j.properties + +# set permissions for hive bootstrap file +ADD scripts/hive-bootstrap.sh /etc/hive-bootstrap.sh +RUN chown root:root /etc/hive-bootstrap.sh +RUN chmod 700 /etc/hive-bootstrap.sh + +EXPOSE 10000 10001 10002 10003 9083 50111 5432 + +# run bootstrap script +CMD ["/etc/hive-bootstrap.sh", "-d"] diff --git a/tools/hive/single-image/conf/hive-log4j.properties b/tools/hive/single-image/conf/hive-log4j.properties new file mode 100644 index 00000000..b258a503 --- /dev/null +++ b/tools/hive/single-image/conf/hive-log4j.properties @@ -0,0 +1,88 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Define some default values that can be overridden by system properties +hive.log.threshold=ALL +hive.root.logger=INFO,DRFA +hive.log.dir=/tmp/logs/ +hive.log.file=hive.log + +# Define the root logger to the system property "hadoop.root.logger". +log4j.rootLogger=${hive.root.logger}, EventCounter + +# Logging Threshold +log4j.threshold=${hive.log.threshold} + +# +# Daily Rolling File Appender +# +# Use the PidDailyRollingFileAppender class instead if you want to use separate log files +# for different CLI session. +# +# log4j.appender.DRFA=org.apache.hadoop.hive.ql.log.PidDailyRollingFileAppender + +log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender + +log4j.appender.DRFA.File=${hive.log.dir}/${hive.log.file} + +# Rollover at midnight +log4j.appender.DRFA.DatePattern=.yyyy-MM-dd + +# 30-day backup +#log4j.appender.DRFA.MaxBackupIndex=30 +log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout + +# Pattern format: Date LogLevel LoggerName LogMessage +#log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n +# Debugging Pattern format +log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n + + +# +# console +# Add "console" to rootlogger above if you want to use this +# + +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.err +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n +log4j.appender.console.encoding=UTF-8 + +#custom logging levels +#log4j.logger.xxx=DEBUG + +# +# Event Counter Appender +# Sends counts of logging messages at different severity levels to Hadoop Metrics.
+# +log4j.appender.EventCounter=org.apache.hadoop.hive.shims.HiveEventCounter + + +log4j.category.DataNucleus=ERROR,DRFA +log4j.category.Datastore=ERROR,DRFA +log4j.category.Datastore.Schema=ERROR,DRFA +log4j.category.JPOX.Datastore=ERROR,DRFA +log4j.category.JPOX.Plugin=ERROR,DRFA +log4j.category.JPOX.MetaData=ERROR,DRFA +log4j.category.JPOX.Query=ERROR,DRFA +log4j.category.JPOX.General=ERROR,DRFA +log4j.category.JPOX.Enhancer=ERROR,DRFA + + +# Silence useless ZK logs +log4j.logger.org.apache.zookeeper.server.NIOServerCnxn=WARN,DRFA +log4j.logger.org.apache.zookeeper.ClientCnxnSocketNIO=WARN,DRFA diff --git a/tools/hive/single-image/conf/log4j.properties b/tools/hive/single-image/conf/log4j.properties new file mode 100755 index 00000000..bd066b0b --- /dev/null +++ b/tools/hive/single-image/conf/log4j.properties @@ -0,0 +1,6 @@ +log4j.rootLogger=${hive.root.logger} +hive.root.logger=INFO,console +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.err +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n diff --git a/tools/hive/single-image/conf/postgresql.conf b/tools/hive/single-image/conf/postgresql.conf new file mode 100644 index 00000000..38d855ed --- /dev/null +++ b/tools/hive/single-image/conf/postgresql.conf @@ -0,0 +1,630 @@ +# ----------------------------- +# PostgreSQL configuration file +# ----------------------------- +# +# This file consists of lines of the form: +# +# name = value +# +# (The "=" is optional.) Whitespace may be used. Comments are introduced with +# "#" anywhere on a line. The complete list of parameter names and allowed +# values can be found in the PostgreSQL documentation. +# +# The commented-out settings shown in this file represent the default values. +# Re-commenting a setting is NOT sufficient to revert it to the default value; +# you need to reload the server. 
+# +# This file is read on server startup and when the server receives a SIGHUP +# signal. If you edit the file on a running system, you have to SIGHUP the +# server for the changes to take effect, or use "pg_ctl reload". Some +# parameters, which are marked below, require a server shutdown and restart to +# take effect. +# +# Any parameter can also be given as a command-line option to the server, e.g., +# "postgres -c log_connections=on". Some parameters can be changed at run time +# with the "SET" SQL command. +# +# Memory units: kB = kilobytes Time units: ms = milliseconds +# MB = megabytes s = seconds +# GB = gigabytes min = minutes +# TB = terabytes h = hours +# d = days + + +#------------------------------------------------------------------------------ +# FILE LOCATIONS +#------------------------------------------------------------------------------ + +# The default values of these variables are driven from the -D command-line +# option or PGDATA environment variable, represented here as ConfigDir. + +data_directory = '/var/lib/postgresql/9.5/main' # use data in another directory +# (change requires restart) +hba_file = '/var/lib/postgresql/9.5/pg_hba.conf' # host-based authentication file +# (change requires restart) +ident_file = '/var/lib/postgresql/9.5/pg_ident.conf' # ident configuration file +# (change requires restart) + +# If external_pid_file is not explicitly set, no extra PID file is written. 
+#external_pid_file = '' # write an extra PID file +# (change requires restart) + + +#------------------------------------------------------------------------------ +# CONNECTIONS AND AUTHENTICATION +#------------------------------------------------------------------------------ + +# - Connection Settings - + +listen_addresses = '*' # what IP address(es) to listen on; +# comma-separated list of addresses; +# defaults to 'localhost'; use '*' for all +# (change requires restart) +port = 5432 # (change requires restart) +max_connections = 100 # (change requires restart) +# Note: Increasing max_connections costs ~400 bytes of shared memory per +# connection slot, plus lock space (see max_locks_per_transaction). +#superuser_reserved_connections = 3 # (change requires restart) +#unix_socket_directories = '/tmp' # comma-separated list of directories +# (change requires restart) +#unix_socket_group = '' # (change requires restart) +#unix_socket_permissions = 0777 # begin with 0 to use octal notation +# (change requires restart) +#bonjour = off # advertise server via Bonjour +# (change requires restart) +#bonjour_name = '' # defaults to the computer name +# (change requires restart) + +# - Security and Authentication - + +#authentication_timeout = 1min # 1s-600s +ssl = off # (change requires restart) +#ssl_ciphers = 'HIGH:MEDIUM:+3DES:!aNULL' # allowed SSL ciphers +# (change requires restart) +#ssl_prefer_server_ciphers = on # (change requires restart) +#ssl_ecdh_curve = 'prime256v1' # (change requires restart) +#ssl_cert_file = 'server.crt' # (change requires restart) +#ssl_key_file = 'server.key' # (change requires restart) +#ssl_ca_file = '' # (change requires restart) +#ssl_crl_file = '' # (change requires restart) +#password_encryption = on +#db_user_namespace = off +#row_security = on + +# GSSAPI using Kerberos +#krb_server_keyfile = '' +#krb_caseins_users = off + +# - TCP Keepalives - +# see "man 7 tcp" for details + +#tcp_keepalives_idle = 0 # TCP_KEEPIDLE, in 
seconds; +# 0 selects the system default +#tcp_keepalives_interval = 0 # TCP_KEEPINTVL, in seconds; +# 0 selects the system default +#tcp_keepalives_count = 0 # TCP_KEEPCNT; +# 0 selects the system default + + +#------------------------------------------------------------------------------ +# RESOURCE USAGE (except WAL) +#------------------------------------------------------------------------------ + +# - Memory - + +shared_buffers = 128MB # min 128kB +# (change requires restart) +#huge_pages = try # on, off, or try +# (change requires restart) +#temp_buffers = 8MB # min 800kB +#max_prepared_transactions = 0 # zero disables the feature +# (change requires restart) +# Note: Increasing max_prepared_transactions costs ~600 bytes of shared memory +# per transaction slot, plus lock space (see max_locks_per_transaction). +# It is not advisable to set max_prepared_transactions nonzero unless you +# actively intend to use prepared transactions. +#work_mem = 4MB # min 64kB +#maintenance_work_mem = 64MB # min 1MB +#autovacuum_work_mem = -1 # min 1MB, or -1 to use maintenance_work_mem +#max_stack_depth = 2MB # min 100kB +#dynamic_shared_memory_type = posix # the default is the first option +# supported by the operating system: +# posix +# sysv +# windows +# mmap +# use none to disable dynamic shared memory + +# - Disk - + +#temp_file_limit = -1 # limits per-session temp file space +# in kB, or -1 for no limit + +# - Kernel Resource Usage - + +#max_files_per_process = 1000 # min 25 +# (change requires restart) +#shared_preload_libraries = '' # (change requires restart) + +# - Cost-Based Vacuum Delay - + +#vacuum_cost_delay = 0 # 0-100 milliseconds +#vacuum_cost_page_hit = 1 # 0-10000 credits +#vacuum_cost_page_miss = 10 # 0-10000 credits +#vacuum_cost_page_dirty = 20 # 0-10000 credits +#vacuum_cost_limit = 200 # 1-10000 credits + +# - Background Writer - + +#bgwriter_delay = 200ms # 10-10000ms between rounds +#bgwriter_lru_maxpages = 100 # 0-1000 max buffers written/round 
+#bgwriter_lru_multiplier = 2.0 # 0-10.0 multipler on buffers scanned/round + +# - Asynchronous Behavior - + +#effective_io_concurrency = 1 # 1-1000; 0 disables prefetching +#max_worker_processes = 8 + + +#------------------------------------------------------------------------------ +# WRITE AHEAD LOG +#------------------------------------------------------------------------------ + +# - Settings - + +#wal_level = minimal # minimal, archive, hot_standby, or logical +# (change requires restart) +#fsync = on # turns forced synchronization on or off +#synchronous_commit = on # synchronization level; +# off, local, remote_write, or on +#wal_sync_method = fsync # the default is the first option +# supported by the operating system: +# open_datasync +# fdatasync (default on Linux) +# fsync +# fsync_writethrough +# open_sync +#full_page_writes = on # recover from partial page writes +#wal_compression = off # enable compression of full-page writes +#wal_log_hints = off # also do full page writes of non-critical updates +# (change requires restart) +#wal_buffers = -1 # min 32kB, -1 sets based on shared_buffers +# (change requires restart) +#wal_writer_delay = 200ms # 1-10000 milliseconds + +#commit_delay = 0 # range 0-100000, in microseconds +#commit_siblings = 5 # range 1-1000 + +# - Checkpoints - + +#checkpoint_timeout = 5min # range 30s-1h +#max_wal_size = 1GB +#min_wal_size = 80MB +#checkpoint_completion_target = 0.5 # checkpoint target duration, 0.0 - 1.0 +#checkpoint_warning = 30s # 0 disables + +# - Archiving - + +#archive_mode = off # enables archiving; off, on, or always +# (change requires restart) +#archive_command = '' # command to use to archive a logfile segment +# placeholders: %p = path of file to archive +# %f = file name only +# e.g. 'test ! 
-f /mnt/server/archivedir/%f && cp %p /mnt/server/archivedir/%f' +#archive_timeout = 0 # force a logfile segment switch after this +# number of seconds; 0 disables + + +#------------------------------------------------------------------------------ +# REPLICATION +#------------------------------------------------------------------------------ + +# - Sending Server(s) - + +# Set these on the master and on any standby that will send replication data. + +#max_wal_senders = 0 # max number of walsender processes +# (change requires restart) +#wal_keep_segments = 0 # in logfile segments, 16MB each; 0 disables +#wal_sender_timeout = 60s # in milliseconds; 0 disables + +#max_replication_slots = 0 # max number of replication slots +# (change requires restart) +#track_commit_timestamp = off # collect timestamp of transaction commit +# (change requires restart) + +# - Master Server - + +# These settings are ignored on a standby server. + +#synchronous_standby_names = '' # standby servers that provide sync rep +# comma-separated list of application_name +# from standby(s); '*' = all +#vacuum_defer_cleanup_age = 0 # number of xacts by which cleanup is delayed + +# - Standby Servers - + +# These settings are ignored on a master server. 
+ +#hot_standby = off # "on" allows queries during recovery +# (change requires restart) +#max_standby_archive_delay = 30s # max delay before canceling queries +# when reading WAL from archive; +# -1 allows indefinite delay +#max_standby_streaming_delay = 30s # max delay before canceling queries +# when reading streaming WAL; +# -1 allows indefinite delay +#wal_receiver_status_interval = 10s # send replies at least this often +# 0 disables +#hot_standby_feedback = off # send info from standby to prevent +# query conflicts +#wal_receiver_timeout = 60s # time that receiver waits for +# communication from master +# in milliseconds; 0 disables +#wal_retrieve_retry_interval = 5s # time to wait before retrying to +# retrieve WAL after a failed attempt + + +#------------------------------------------------------------------------------ +# QUERY TUNING +#------------------------------------------------------------------------------ + +# - Planner Method Configuration - + +#enable_bitmapscan = on +#enable_hashagg = on +#enable_hashjoin = on +#enable_indexscan = on +#enable_indexonlyscan = on +#enable_material = on +#enable_mergejoin = on +#enable_nestloop = on +#enable_seqscan = on +#enable_sort = on +#enable_tidscan = on + +# - Planner Cost Constants - + +#seq_page_cost = 1.0 # measured on an arbitrary scale +#random_page_cost = 4.0 # same scale as above +#cpu_tuple_cost = 0.01 # same scale as above +#cpu_index_tuple_cost = 0.005 # same scale as above +#cpu_operator_cost = 0.0025 # same scale as above +#effective_cache_size = 4GB + +# - Genetic Query Optimizer - + +#geqo = on +#geqo_threshold = 12 +#geqo_effort = 5 # range 1-10 +#geqo_pool_size = 0 # selects default based on effort +#geqo_generations = 0 # selects default based on effort +#geqo_selection_bias = 2.0 # range 1.5-2.0 +#geqo_seed = 0.0 # range 0.0-1.0 + +# - Other Planner Options - + +#default_statistics_target = 100 # range 1-10000 +#constraint_exclusion = partition # on, off, or partition 
+#cursor_tuple_fraction = 0.1 # range 0.0-1.0 +#from_collapse_limit = 8 +#join_collapse_limit = 8 # 1 disables collapsing of explicit +# JOIN clauses + + +#------------------------------------------------------------------------------ +# ERROR REPORTING AND LOGGING +#------------------------------------------------------------------------------ + +# - Where to Log - + +log_destination = 'stderr' # Valid values are combinations of +# stderr, csvlog, syslog, and eventlog, +# depending on platform. csvlog +# requires logging_collector to be on. + +# This is used when logging to stderr: +logging_collector = on # Enable capturing of stderr and csvlog +# into log files. Required to be on for +# csvlogs. +# (change requires restart) + +# These are only used if logging_collector is on: +log_directory = 'pg_log' # directory where log files are written, +# can be absolute or relative to PGDATA +log_filename = 'postgresql-%a.log' # log file name pattern, +# can include strftime() escapes +#log_file_mode = 0600 # creation mode for log files, +# begin with 0 to use octal notation +log_truncate_on_rotation = on # If on, an existing log file with the +# same name as the new log file will be +# truncated rather than appended to. +# But such truncation only occurs on +# time-driven rotation, not on restarts +# or size-driven rotation. Default is +# off, meaning append to existing files +# in all cases. +log_rotation_age = 1d # Automatic rotation of logfiles will +# happen after that time. 0 disables. +log_rotation_size = 0 # Automatic rotation of logfiles will +# happen after that much log output. +# 0 disables. 
+ +# These are relevant when logging to syslog: +#syslog_facility = 'LOCAL0' +#syslog_ident = 'postgres' + +# This is only relevant when logging to eventlog (win32): +#event_source = 'PostgreSQL' + +# - When to Log - + +#client_min_messages = notice # values in order of decreasing detail: +# debug5 +# debug4 +# debug3 +# debug2 +# debug1 +# log +# notice +# warning +# error + +#log_min_messages = warning # values in order of decreasing detail: +# debug5 +# debug4 +# debug3 +# debug2 +# debug1 +# info +# notice +# warning +# error +# log +# fatal +# panic + +#log_min_error_statement = error # values in order of decreasing detail: +# debug5 +# debug4 +# debug3 +# debug2 +# debug1 +# info +# notice +# warning +# error +# log +# fatal +# panic (effectively off) + +#log_min_duration_statement = -1 # -1 is disabled, 0 logs all statements +# and their durations, > 0 logs only +# statements running at least this number +# of milliseconds + + +# - What to Log - + +#debug_print_parse = off +#debug_print_rewritten = off +#debug_print_plan = off +#debug_pretty_print = on +#log_checkpoints = off +#log_connections = off +#log_disconnections = off +#log_duration = off +#log_error_verbosity = default # terse, default, or verbose messages +#log_hostname = off +log_line_prefix = '< %m >' # special values: +# %a = application name +# %u = user name +# %d = database name +# %r = remote host and port +# %h = remote host +# %p = process ID +# %t = timestamp without milliseconds +# %m = timestamp with milliseconds +# %i = command tag +# %e = SQL state +# %c = session ID +# %l = session line number +# %s = session start timestamp +# %v = virtual transaction ID +# %x = transaction ID (0 if none) +# %q = stop here in non-session +# processes +# %% = '%' +# e.g. 
'<%u%%%d> ' +#log_lock_waits = off # log lock waits >= deadlock_timeout +#log_statement = 'none' # none, ddl, mod, all +#log_replication_commands = off +#log_temp_files = -1 # log temporary files equal or larger +# than the specified size in kilobytes; +# -1 disables, 0 logs all temp files +#log_timezone = 'GMT' + + +# - Process Title - + +#cluster_name = '' # added to process titles if nonempty +# (change requires restart) +#update_process_title = on + + +#------------------------------------------------------------------------------ +# RUNTIME STATISTICS +#------------------------------------------------------------------------------ + +# - Query/Index Statistics Collector - + +#track_activities = on +#track_counts = on +#track_io_timing = off +#track_functions = none # none, pl, all +#track_activity_query_size = 1024 # (change requires restart) +#stats_temp_directory = 'pg_stat_tmp' + + +# - Statistics Monitoring - + +#log_parser_stats = off +#log_planner_stats = off +#log_executor_stats = off +#log_statement_stats = off + + +#------------------------------------------------------------------------------ +# AUTOVACUUM PARAMETERS +#------------------------------------------------------------------------------ + +#autovacuum = on # Enable autovacuum subprocess? 'on' +# requires track_counts to also be on. +#log_autovacuum_min_duration = -1 # -1 disables, 0 logs all actions and +# their durations, > 0 logs only +# actions running at least this number +# of milliseconds. 
+#autovacuum_max_workers = 3 # max number of autovacuum subprocesses +# (change requires restart) +#autovacuum_naptime = 1min # time between autovacuum runs +#autovacuum_vacuum_threshold = 50 # min number of row updates before +# vacuum +#autovacuum_analyze_threshold = 50 # min number of row updates before +# analyze +#autovacuum_vacuum_scale_factor = 0.2 # fraction of table size before vacuum +#autovacuum_analyze_scale_factor = 0.1 # fraction of table size before analyze +#autovacuum_freeze_max_age = 200000000 # maximum XID age before forced vacuum +# (change requires restart) +#autovacuum_multixact_freeze_max_age = 400000000 # maximum multixact age +# before forced vacuum +# (change requires restart) +#autovacuum_vacuum_cost_delay = 20ms # default vacuum cost delay for +# autovacuum, in milliseconds; +# -1 means use vacuum_cost_delay +#autovacuum_vacuum_cost_limit = -1 # default vacuum cost limit for +# autovacuum, -1 means use +# vacuum_cost_limit + + +#------------------------------------------------------------------------------ +# CLIENT CONNECTION DEFAULTS +#------------------------------------------------------------------------------ + +# - Statement Behavior - + +#search_path = '"$user", public' # schema names +#default_tablespace = '' # a tablespace name, '' uses the default +#temp_tablespaces = '' # a list of tablespace names, '' uses +# only default tablespace +#check_function_bodies = on +#default_transaction_isolation = 'read committed' +#default_transaction_read_only = off +#default_transaction_deferrable = off +#session_replication_role = 'origin' +#statement_timeout = 0 # in milliseconds, 0 is disabled +#lock_timeout = 0 # in milliseconds, 0 is disabled +#vacuum_freeze_min_age = 50000000 +#vacuum_freeze_table_age = 150000000 +#vacuum_multixact_freeze_min_age = 5000000 +#vacuum_multixact_freeze_table_age = 150000000 +#bytea_output = 'hex' # hex, escape +#xmlbinary = 'base64' +#xmloption = 'content' +#gin_fuzzy_search_limit = 0 
+#gin_pending_list_limit = 4MB + +# - Locale and Formatting - + +#datestyle = 'iso, mdy' +#intervalstyle = 'postgres' +#timezone = 'GMT' +#timezone_abbreviations = 'Default' # Select the set of available time zone +# abbreviations. Currently, there are +# Default +# Australia (historical usage) +# India +# You can create your own file in +# share/timezonesets/. +#extra_float_digits = 0 # min -15, max 3 +#client_encoding = sql_ascii # actually, defaults to database +# encoding + +# These settings are initialized by initdb, but they can be changed. +#lc_messages = 'C' # locale for system error message +# strings +#lc_monetary = 'C' # locale for monetary formatting +#lc_numeric = 'C' # locale for number formatting +#lc_time = 'C' # locale for time formatting + +# default configuration for text search +#default_text_search_config = 'pg_catalog.simple' + +# - Other Defaults - + +#dynamic_library_path = '$libdir' +#local_preload_libraries = '' +#session_preload_libraries = '' + + +#------------------------------------------------------------------------------ +# LOCK MANAGEMENT +#------------------------------------------------------------------------------ + +#deadlock_timeout = 1s +#max_locks_per_transaction = 64 # min 10 +# (change requires restart) +# Note: Each lock table slot uses ~270 bytes of shared memory, and there are +# max_locks_per_transaction * (max_connections + max_prepared_transactions) +# lock table slots. 
+#max_pred_locks_per_transaction = 64 # min 10 +# (change requires restart) + + +#------------------------------------------------------------------------------ +# VERSION/PLATFORM COMPATIBILITY +#------------------------------------------------------------------------------ + +# - Previous PostgreSQL Versions - + +#array_nulls = on +#backslash_quote = safe_encoding # on, off, or safe_encoding +#default_with_oids = off +#escape_string_warning = on +#lo_compat_privileges = off +#operator_precedence_warning = off +#quote_all_identifiers = off +#sql_inheritance = on +#standard_conforming_strings = on +#synchronize_seqscans = on + +# - Other Platforms and Clients - + +#transform_null_equals = off + + +#------------------------------------------------------------------------------ +# ERROR HANDLING +#------------------------------------------------------------------------------ + +#exit_on_error = off # terminate session on any error? +#restart_after_crash = on # reinitialize after backend crash? + + +#------------------------------------------------------------------------------ +# CONFIG FILE INCLUDES +#------------------------------------------------------------------------------ + +# These options allow settings to be loaded from files other than the +# default postgresql.conf. 
+ +#include_dir = 'conf.d' # include files ending in '.conf' from +# directory 'conf.d' +#include_if_exists = 'exists.conf' # include file only if it exists +#include = 'special.conf' # include file + + +#------------------------------------------------------------------------------ +# CUSTOMIZED OPTIONS +#------------------------------------------------------------------------------ + +# Add settings for extensions here diff --git a/tools/hive/single-image/scripts/bootstrap.sh b/tools/hive/single-image/scripts/hadoop-bootstrap.sh similarity index 100% rename from tools/hive/single-image/scripts/bootstrap.sh rename to tools/hive/single-image/scripts/hadoop-bootstrap.sh diff --git a/tools/hive/single-image/scripts/hive-bootstrap.sh b/tools/hive/single-image/scripts/hive-bootstrap.sh new file mode 100644 index 00000000..1dae776a --- /dev/null +++ b/tools/hive/single-image/scripts/hive-bootstrap.sh @@ -0,0 +1,49 @@ +#!/bin/bash +set -x +# append the current environment to /root/.bashrc so ssh sessions see the same variables +printenv >> /root/.bashrc + +# hadoop bootstrap +/etc/hadoop-bootstrap.sh -d + +# restart postgresql +/etc/init.d/postgresql restart + +# kerberos mode: only when kinit is installed, log in as hdfs@LOCAL from the keytab +if command -v kinit >/dev/null 2>&1; then + kinit -k -t /usr/local/hadoop/etc/hadoop/hdfs.keytab hdfs@LOCAL +fi + +until hdfs dfs -ls / +do + echo "waiting for hdfs to be ready"; sleep 10; +done + +# create hdfs directories +"$HADOOP_PREFIX"/bin/hdfs dfs -mkdir -p /user/root +hdfs dfs -chown -R hdfs:supergroup /user + +"$HADOOP_PREFIX"/bin/hdfs dfs -mkdir -p /apps/hive/warehouse +hdfs dfs -chown -R hive:supergroup /apps/hive +hdfs dfs -chmod 777 /apps/hive/warehouse + +# render hive-site.xml from each template, replacing HOSTNAME with the value of $HOSTNAME +sed "s/HOSTNAME/$HOSTNAME/" /usr/local/hive/conf/hive-site.xml.template > /usr/local/hive/conf/hive-site.xml +sed "s/HOSTNAME/$HOSTNAME/" /opt/files/hive-site.xml.template > /opt/files/hive-site.xml + +# start hive metastore server +"$HIVE_HOME"/bin/hive --service metastore & + +sleep 20 + +# start hive server +"$HIVE_HOME"/bin/hive
--service hiveserver2 & + + +if [[ $1 == "-bash" ]]; then + /bin/bash +fi + +if [[ $1 == "-d" ]]; then + while true; do sleep 10000; done +fi diff --git a/tools/hive/single-image/templates/hive-site.xml.template b/tools/hive/single-image/templates/hive-site.xml.template new file mode 100755 index 00000000..d22861b4 --- /dev/null +++ b/tools/hive/single-image/templates/hive-site.xml.template @@ -0,0 +1,154 @@ + + + hive.metastore.cache.pinobjtypes + Table,Database,Type,FieldSchema,Order + + + javax.jdo.option.ConnectionDriverName + org.postgresql.Driver + + + javax.jdo.option.ConnectionUserName + hive + + + hive.auto.convert.join + true + + + fs.hdfs.impl.disable.cache + true + + + fs.file.impl.disable.cache + true + + + hive.metastore.warehouse.dir + /apps/hive/warehouse + + + hive.auto.convert.sortmerge.join + true + + + hive.metastore.client.socket.timeout + 60 + + + hive.optimize.bucketmapjoin + true + + + hive.optimize.bucketmapjoin.sortedmerge + true + + + hive.optimize.index.filter + true + + + hive.auto.convert.join.noconditionaltask.size + 1000000000 + + + hive.auto.convert.join.noconditionaltask + true + + + hive.mapjoin.bucket.cache.size + 10000 + + + hive.vectorized.execution.enabled + true + + + hive.security.authorization.enabled + false + + + hive.optimize.reducededuplication.min.reducer + 4 + + + hive.server2.enable.doAs + true + + + hive.mapred.reduce.tasks.speculative.execution + false + + + javax.jdo.option.ConnectionURL + jdbc:postgresql://localhost/metastore + + + hive.enforce.bucketing + true + + + hive.metastore.execute.setugi + true + + + hive.enforce.sorting + true + + + hive.security.authorization.manager + org.apache.hadoop.hive.ql.security.authorization.DefaultHiveAuthorizationProvider + + + hive.map.aggr + true + + + hive.optimize.reducededuplication + true + + + + hive.vectorized.execution.enabled + true + + + hive.vectorized.groupby.maxentries + 10000 + + + hive.vectorized.groupby.checkinterval + 10000 + + + hive.input.format +
org.apache.hadoop.hive.ql.io.HiveInputFormat + + + javax.jdo.option.ConnectionPassword + hive + + + tez.am.node-blacklisting.enabled + false + + + hive.prewarm.numcontainers + 3 + + Controls the number of containers to prewarm for tez (hadoop 2 only) + + + + mapred.tez.java.opts + -Xmx256m + + + hive.tez.container.size + 256 + + + From 8ab718beb93926faaa970a54ae824d161efccf91 Mon Sep 17 00:00:00 2001 From: "Susan X. Huynh" Date: Tue, 21 Aug 2018 10:40:55 -0700 Subject: [PATCH 08/15] removed unnecessary configs from yarn-site.xml and hive-site.xml --- tools/hive/kerberos/Dockerfile | 4 +- .../kerberos/marathon/hdfs-hive-kerberos.json | 2 +- .../kerberos/templates/hive-site.xml.template | 182 ++++-------------- .../kerberos/templates/yarn-site.xml.template | 64 +----- tools/hive/single-image/Dockerfile | 36 ++-- .../templates/hive-site.xml.template | 116 ----------- .../templates/yarn-site.xml.template | 62 ------ 7 files changed, 55 insertions(+), 411 deletions(-) diff --git a/tools/hive/kerberos/Dockerfile b/tools/hive/kerberos/Dockerfile index a79b30cd..ca3c0188 100644 --- a/tools/hive/kerberos/Dockerfile +++ b/tools/hive/kerberos/Dockerfile @@ -14,7 +14,9 @@ ADD templates/hive-site.xml.template $HIVE_CONF/hive-site.xml.template ADD conf/krb5.conf /etc/ # install kinit, used in bootstrap script -RUN apt-get install -y krb5-user +RUN apt-get update && \ + apt-get install -y krb5-user && \ + rm -rf /var/lib/apt/lists/* # run bootstrap script which starts hadoop and hive servers CMD ["/etc/hive-bootstrap.sh", "-d"] diff --git a/tools/hive/kerberos/marathon/hdfs-hive-kerberos.json b/tools/hive/kerberos/marathon/hdfs-hive-kerberos.json index d2963654..8cfa91ac 100644 --- a/tools/hive/kerberos/marathon/hdfs-hive-kerberos.json +++ b/tools/hive/kerberos/marathon/hdfs-hive-kerberos.json @@ -33,7 +33,7 @@ [ "hostname", "IS", - "10.0.1.100" + "10.0.0.114" ] ] } diff --git a/tools/hive/kerberos/templates/hive-site.xml.template 
b/tools/hive/kerberos/templates/hive-site.xml.template index 4ad073bc..0b541140 100755 --- a/tools/hive/kerberos/templates/hive-site.xml.template +++ b/tools/hive/kerberos/templates/hive-site.xml.template @@ -1,152 +1,40 @@ - hive.metastore.cache.pinobjtypes - Table,Database,Type,FieldSchema,Order - - - javax.jdo.option.ConnectionDriverName - org.postgresql.Driver - - - javax.jdo.option.ConnectionUserName - hive - - - hive.auto.convert.join - true - - - fs.hdfs.impl.disable.cache - true - - - fs.file.impl.disable.cache - true - - - hive.metastore.warehouse.dir - /apps/hive/warehouse - - - hive.auto.convert.sortmerge.join - true - - - hive.metastore.client.socket.timeout - 60 - - - hive.optimize.bucketmapjoin - true - - - hive.optimize.bucketmapjoin.sortedmerge - true - - - hive.optimize.index.filter - true - - - hive.auto.convert.join.noconditionaltask.size - 1000000000 - - - hive.auto.convert.join.noconditionaltask - true - - - hive.mapjoin.bucket.cache.size - 10000 - - - hive.vectorized.execution.enabled - true - - - hive.security.authorization.enabled - false - - - hive.optimize.reducededuplication.min.reducer - 4 - - - hive.server2.enable.doAs - true - - - hive.mapred.reduce.tasks.speculative.execution - false - - - javax.jdo.option.ConnectionURL - jdbc:postgresql://localhost/metastore - - - hive.enforce.bucketing - true - - - hive.metastore.execute.setugi - true - - - hive.enforce.sorting - true - - - hive.map.aggr - true - - - hive.optimize.reducededuplication - true - - - - hive.vectorized.execution.enabled - true - - - hive.vectorized.groupby.maxentries - 10000 - - - hive.vectorized.groupby.checkinterval - 10000 - - - hive.input.format - org.apache.hadoop.hive.ql.io.HiveInputFormat - - - javax.jdo.option.ConnectionPassword - hive - - - tez.am.node-blacklisting.enabled - false - - - hive.prewarm.numcontainers - 3 - - Controls the number of containers to prewarm for tez (hadoop 2 only) - - - - mapred.tez.java.opts - -Xmx256m - - - hive.tez.container.size - 
256 - - + javax.jdo.option.ConnectionDriverName + org.postgresql.Driver + + + javax.jdo.option.ConnectionUserName + hive + + + hive.metastore.warehouse.dir + /apps/hive/warehouse + + + hive.metastore.client.socket.timeout + 60 + + + hive.server2.enable.doAs + true + + + javax.jdo.option.ConnectionURL + jdbc:postgresql://localhost/metastore + + + hive.metastore.execute.setugi + true + + + hive.input.format + org.apache.hadoop.hive.ql.io.HiveInputFormat + + + javax.jdo.option.ConnectionPassword + hive + diff --git a/tools/hive/kerberos/templates/yarn-site.xml.template b/tools/hive/kerberos/templates/yarn-site.xml.template index 3ecbd8f6..f110584f 100644 --- a/tools/hive/kerberos/templates/yarn-site.xml.template +++ b/tools/hive/kerberos/templates/yarn-site.xml.template @@ -1,72 +1,10 @@ - - yarn.nodemanager.aux-services - mapreduce_shuffle - - - - yarn.application.classpath - /usr/local/hadoop/etc/hadoop, /usr/local/hadoop/share/hadoop/common/*, /usr/local/hadoop/share/hadoop/common/lib/*, /usr/local/hadoop/share/hadoop/hdfs/*, /usr/local/hadoop/share/hadoop/hdfs/lib/*, /usr/local/hadoop/share/hadoop/mapreduce/*, /usr/local/hadoop/share/hadoop/mapreduce/lib/*, /usr/local/hadoop/share/hadoop/yarn/*, /usr/local/hadoop/share/hadoop/yarn/lib/* - - - - - Number of seconds after an application finishes before the nodemanager's - DeletionService will delete the application's localized file directory - and log directory. - - To diagnose Yarn application problems, set this property's value large - enough (for example, to 600 = 10 minutes) to permit examination of these - directories. After changing the property's value, you must restart the - nodemanager in order for it to have an effect. - - The roots of Yarn applications' work directories is configurable with - the yarn.nodemanager.local-dirs property (see below), and the roots - of the Yarn applications' log directories is configurable with the - yarn.nodemanager.log-dirs property (see also below). 
- - yarn.nodemanager.delete.debug-delay-sec - 600 - - - - Indicate to clients whether Timeline service is enabled or not. - If enabled, the TimelineClient library used by end-users will post entities - and events to the Timeline server. - yarn.timeline-service.enabled - true - - - - The hostname of the Timeline service web application. - yarn.timeline-service.hostname - 0.0.0.0 - - - - Enables cross-origin support (CORS) for web services where - cross-origin web response headers are needed. For example, javascript making - a web services request to the timeline server. - yarn.timeline-service.http-cross-origin.enabled - true - - - - yarn.resourcemanager.webapp.cross-origin.enabled - true - - - - Publish YARN information to Timeline Server - yarn.resourcemanager.system-metrics-publisher.enabled - true - - yarn.resourcemanager.keytab /usr/local/hadoop/etc/hadoop/hdfs.keytab + yarn.resourcemanager.principal yarn/HOSTNAME@LOCAL diff --git a/tools/hive/single-image/Dockerfile b/tools/hive/single-image/Dockerfile index 9b354ac5..dc709474 100644 --- a/tools/hive/single-image/Dockerfile +++ b/tools/hive/single-image/Dockerfile @@ -23,8 +23,9 @@ ENV POSTGRESQL_BIN /usr/lib/postgresql/9.5/bin/postgres ENV PGPASSWORD hive # install dev tools -RUN apt-get update -RUN apt-get install -y curl wget tar openssh-server openssh-client rsync python-software-properties apt-file apache2 +RUN apt-get update && \ + apt-get install -y curl wget tar openssh-server openssh-client rsync python-software-properties apt-file apache2 && \ + rm -rf /var/lib/apt/lists/* # for running sshd in ubuntu trusty. 
https://github.com/docker/docker/issues/5704 RUN mkdir /var/run/sshd @@ -51,14 +52,15 @@ RUN chown root:root /root/.ssh/config EXPOSE 22 # oracle jdk 8 -RUN apt-get install -y software-properties-common -RUN add-apt-repository ppa:webupd8team/java -RUN apt-get update - -# to accept license agreement automatically -RUN echo debconf shared/accepted-oracle-license-v1-1 select true | debconf-set-selections -RUN echo debconf shared/accepted-oracle-license-v1-1 seen true | debconf-set-selections -RUN apt-get install -y oracle-java8-installer +RUN apt-get update && \ + apt-get install -y software-properties-common && \ + add-apt-repository ppa:webupd8team/java && \ + apt-get update && \ + # to accept license agreement automatically + echo debconf shared/accepted-oracle-license-v1-1 select true | debconf-set-selections && \ + echo debconf shared/accepted-oracle-license-v1-1 seen true | debconf-set-selections && \ + apt-get install -y oracle-java8-installer && \ + rm -rf /var/lib/apt/lists/* # java env setup ENV JAVA_HOME /usr/lib/jvm/java-8-oracle @@ -71,11 +73,6 @@ RUN curl -L http://archive.cloudera.com/cdh${CDH_VERSION}/cdh/${CDH_VERSION}/had ln -s ./hadoop-${HADOOP_VERSION}-cdh${CDH_EXACT_VERSION} hadoop RUN sed -i '/^export JAVA_HOME/ s:.*:export JAVA_HOME=/usr/lib/jvm/java-8-oracle\n:' $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh -#RUN sed -i '/^export HADOOP_CONF_DIR/ s:.*:export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop/:' $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh - -# copy hadoop site xml files -#RUN mkdir $HADOOP_PREFIX/input -#RUN cp $HADOOP_PREFIX/etc/hadoop/*.xml $HADOOP_PREFIX/input # pseudo distributed configurations ADD templates/core-site.xml.template $HADOOP_PREFIX/etc/hadoop/core-site.xml.template @@ -83,10 +80,6 @@ ADD templates/hdfs-site.xml.template $HADOOP_PREFIX/etc/hadoop/hdfs-site.xml.tem ADD conf/mapred-site.xml $HADOOP_PREFIX/etc/hadoop/mapred-site.xml ADD templates/yarn-site.xml.template $HADOOP_PREFIX/etc/hadoop/yarn-site.xml.template -# fixing 
the libhadoop.so -#RUN rm -rf /usr/local/hadoop/lib/native/* -#ADD ./deps/hadoop-native-64-2.6.0.tar /usr/local/hadoop/lib/native/ - # add and set permissions for bootstrap script ADD scripts/hadoop-bootstrap.sh /etc/hadoop-bootstrap.sh RUN chown root:root /etc/hadoop-bootstrap.sh @@ -116,8 +109,9 @@ RUN ln -s /usr/share/java/postgresql-jdbc4.jar $HIVE_HOME/lib/postgresql-jdbc4.j # to configure postgres as hive metastore backend RUN sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt/ `lsb_release -cs`-pgdg main" >> /etc/apt/sources.list.d/pgdg.list' RUN wget -q https://www.postgresql.org/media/keys/ACCC4CF8.asc -O - | apt-key add - -RUN apt-get update -y -RUN apt-get -yq install vim postgresql-9.5 libpostgresql-jdbc-java +RUN apt-get update -y && \ + apt-get -yq install vim postgresql-9.5 libpostgresql-jdbc-java && \ + rm -rf /var/lib/apt/lists/* USER postgres # initialize hive metastore db diff --git a/tools/hive/single-image/templates/hive-site.xml.template b/tools/hive/single-image/templates/hive-site.xml.template index d22861b4..743836ce 100755 --- a/tools/hive/single-image/templates/hive-site.xml.template +++ b/tools/hive/single-image/templates/hive-site.xml.template @@ -1,8 +1,4 @@ - - hive.metastore.cache.pinobjtypes - Table,Database,Type,FieldSchema,Order - javax.jdo.option.ConnectionDriverName org.postgresql.Driver @@ -11,118 +7,26 @@ javax.jdo.option.ConnectionUserName hive - - hive.auto.convert.join - true - - - fs.hdfs.impl.disable.cache - true - - - fs.file.impl.disable.cache - true - hive.metastore.warehouse.dir /apps/hive/warehouse - - hive.auto.convert.sortmerge.join - true - hive.metastore.client.socket.timeout 60 - - hive.optimize.bucketmapjoin - true - - - hive.optimize.bucketmapjoin.sortedmerge - true - - - hive.optimize.index.filter - true - - - hive.auto.convert.join.noconditionaltask.size - 1000000000 - - - hive.auto.convert.join.noconditionaltask - true - - - hive.mapjoin.bucket.cache.size - 10000 - - - 
hive.vectorized.execution.enabled - true - - - hive.security.authorization.enabled - false - - - hive.optimize.reducededuplication.min.reducer - 4 - hive.server2.enable.doAs true - - hive.mapred.reduce.tasks.speculative.execution - false - javax.jdo.option.ConnectionURL jdbc:postgresql://localhost/metastore - - hive.enforce.bucketing - true - hive.metastore.execute.setugi true - - hive.enforce.sorting - true - - - hive.security.authorization.manager - org.apache.hadoop.hive.ql.security.authorization.DefaultHiveAuthorizationProvider - - - hive.map.aggr - true - - - hive.optimize.reducededuplication - true - - - - hive.vectorized.execution.enabled - true - - - hive.vectorized.groupby.maxentries - 10000 - - - hive.vectorized.groupby.checkinterval - 10000 - hive.input.format org.apache.hadoop.hive.ql.io.HiveInputFormat @@ -131,24 +35,4 @@ javax.jdo.option.ConnectionPassword hive - - tez.am.node-blacklisting.enabled - false - - - hive.prewarm.numcontainers - 3 - - Controls the number of containers to prewarm for tez (hadoop 2 only) - - - - mapred.tez.java.opts - -Xmx256m - - - hive.tez.container.size - 256 - - diff --git a/tools/hive/single-image/templates/yarn-site.xml.template b/tools/hive/single-image/templates/yarn-site.xml.template index 37c1850a..83138436 100644 --- a/tools/hive/single-image/templates/yarn-site.xml.template +++ b/tools/hive/single-image/templates/yarn-site.xml.template @@ -1,64 +1,2 @@ - - yarn.nodemanager.aux-services - mapreduce_shuffle - - - - yarn.application.classpath - /usr/local/hadoop/etc/hadoop, /usr/local/hadoop/share/hadoop/common/*, /usr/local/hadoop/share/hadoop/common/lib/*, /usr/local/hadoop/share/hadoop/hdfs/*, /usr/local/hadoop/share/hadoop/hdfs/lib/*, /usr/local/hadoop/share/hadoop/mapreduce/*, /usr/local/hadoop/share/hadoop/mapreduce/lib/*, /usr/local/hadoop/share/hadoop/yarn/*, /usr/local/hadoop/share/hadoop/yarn/lib/* - - - - - Number of seconds after an application finishes before the nodemanager's - DeletionService will 
delete the application's localized file directory - and log directory. - - To diagnose Yarn application problems, set this property's value large - enough (for example, to 600 = 10 minutes) to permit examination of these - directories. After changing the property's value, you must restart the - nodemanager in order for it to have an effect. - - The roots of Yarn applications' work directories is configurable with - the yarn.nodemanager.local-dirs property (see below), and the roots - of the Yarn applications' log directories is configurable with the - yarn.nodemanager.log-dirs property (see also below). - - yarn.nodemanager.delete.debug-delay-sec - 600 - - - - Indicate to clients whether Timeline service is enabled or not. - If enabled, the TimelineClient library used by end-users will post entities - and events to the Timeline server. - yarn.timeline-service.enabled - true - - - - The hostname of the Timeline service web application. - yarn.timeline-service.hostname - 0.0.0.0 - - - - Enables cross-origin support (CORS) for web services where - cross-origin web response headers are needed. For example, javascript making - a web services request to the timeline server. - yarn.timeline-service.http-cross-origin.enabled - true - - - - yarn.resourcemanager.webapp.cross-origin.enabled - true - - - - Publish YARN information to Timeline Server - yarn.resourcemanager.system-metrics-publisher.enabled - true - From 16e032df1208c54978ede59e0ea2c7dfc6aa2c13 Mon Sep 17 00:00:00 2001 From: "Susan X. 
Huynh" Date: Tue, 21 Aug 2018 13:53:49 -0700 Subject: [PATCH 09/15] removed more hive-site.xml properties, removed log4j config files --- .../kerberos/templates/hive-site.xml.template | 55 +++++------- tools/hive/single-image/Dockerfile | 2 - .../single-image/conf/hive-log4j.properties | 88 ------------------- tools/hive/single-image/conf/log4j.properties | 6 -- tools/hive/single-image/conf/postgresql.conf | 2 +- .../templates/hive-site.xml.template | 29 ++---- 6 files changed, 28 insertions(+), 154 deletions(-) delete mode 100644 tools/hive/single-image/conf/hive-log4j.properties delete mode 100755 tools/hive/single-image/conf/log4j.properties diff --git a/tools/hive/kerberos/templates/hive-site.xml.template b/tools/hive/kerberos/templates/hive-site.xml.template index 0b541140..e2bdc064 100755 --- a/tools/hive/kerberos/templates/hive-site.xml.template +++ b/tools/hive/kerberos/templates/hive-site.xml.template @@ -1,40 +1,25 @@ + - javax.jdo.option.ConnectionDriverName - org.postgresql.Driver - - - javax.jdo.option.ConnectionUserName - hive - - - hive.metastore.warehouse.dir - /apps/hive/warehouse - - - hive.metastore.client.socket.timeout - 60 - - - hive.server2.enable.doAs - true - - - javax.jdo.option.ConnectionURL - jdbc:postgresql://localhost/metastore - - - hive.metastore.execute.setugi - true - - - hive.input.format - org.apache.hadoop.hive.ql.io.HiveInputFormat - - - javax.jdo.option.ConnectionPassword - hive - + javax.jdo.option.ConnectionDriverName + org.postgresql.Driver + + + javax.jdo.option.ConnectionURL + jdbc:postgresql://localhost/metastore + + + javax.jdo.option.ConnectionUserName + hive + + + javax.jdo.option.ConnectionPassword + hive + + + hive.metastore.warehouse.dir + /apps/hive/warehouse + diff --git a/tools/hive/single-image/Dockerfile b/tools/hive/single-image/Dockerfile index dc709474..f70dc52b 100644 --- a/tools/hive/single-image/Dockerfile +++ b/tools/hive/single-image/Dockerfile @@ -139,9 +139,7 @@ RUN sed -i -e 's/peer/md5/g' 
/etc/postgresql/$POSTGRES_VERSION/main/pg_hba.conf RUN mkdir /opt/files RUN echo $HIVE_CONF ADD templates/hive-site.xml.template /opt/files/ -ADD conf/hive-log4j.properties /opt/files/ ADD templates/hive-site.xml.template $HIVE_CONF/hive-site.xml.template -ADD conf/hive-log4j.properties $HIVE_CONF/hive-log4j.properties # set permissions for hive bootstrap file ADD scripts/hive-bootstrap.sh /etc/hive-bootstrap.sh diff --git a/tools/hive/single-image/conf/hive-log4j.properties b/tools/hive/single-image/conf/hive-log4j.properties deleted file mode 100644 index b258a503..00000000 --- a/tools/hive/single-image/conf/hive-log4j.properties +++ /dev/null @@ -1,88 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Define some default values that can be overridden by system properties -hive.log.threshold=ALL -hive.root.logger=INFO,DRFA -hive.log.dir=/tmp/logs/ -hive.log.file=hive.log - -# Define the root logger to the system property "hadoop.root.logger". -log4j.rootLogger=${hive.root.logger}, EventCounter - -# Logging Threshold -log4j.threshold=${hive.log.threshold} - -# -# Daily Rolling File Appender -# -# Use the PidDailyerRollingFileAppend class instead if you want to use separate log files -# for different CLI session. 
-# -# log4j.appender.DRFA=org.apache.hadoop.hive.ql.log.PidDailyRollingFileAppender - -log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender - -log4j.appender.DRFA.File=${hive.log.dir}/${hive.log.file} - -# Rollver at midnight -log4j.appender.DRFA.DatePattern=.yyyy-MM-dd - -# 30-day backup -#log4j.appender.DRFA.MaxBackupIndex=30 -log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout - -# Pattern format: Date LogLevel LoggerName LogMessage -#log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n -# Debugging Pattern format -log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n - - -# -# console -# Add "console" to rootlogger above if you want to use this -# - -log4j.appender.console=org.apache.log4j.ConsoleAppender -log4j.appender.console.target=System.err -log4j.appender.console.layout=org.apache.log4j.PatternLayout -log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n -log4j.appender.console.encoding=UTF-8 - -#custom logging levels -#log4j.logger.xxx=DEBUG - -# -# Event Counter Appender -# Sends counts of logging messages at different severity levels to Hadoop Metrics. 
-# -log4j.appender.EventCounter=org.apache.hadoop.hive.shims.HiveEventCounter - - -log4j.category.DataNucleus=ERROR,DRFA -log4j.category.Datastore=ERROR,DRFA -log4j.category.Datastore.Schema=ERROR,DRFA -log4j.category.JPOX.Datastore=ERROR,DRFA -log4j.category.JPOX.Plugin=ERROR,DRFA -log4j.category.JPOX.MetaData=ERROR,DRFA -log4j.category.JPOX.Query=ERROR,DRFA -log4j.category.JPOX.General=ERROR,DRFA -log4j.category.JPOX.Enhancer=ERROR,DRFA - - -# Silence useless ZK logs -log4j.logger.org.apache.zookeeper.server.NIOServerCnxn=WARN,DRFA -log4j.logger.org.apache.zookeeper.ClientCnxnSocketNIO=WARN,DRFA diff --git a/tools/hive/single-image/conf/log4j.properties b/tools/hive/single-image/conf/log4j.properties deleted file mode 100755 index bd066b0b..00000000 --- a/tools/hive/single-image/conf/log4j.properties +++ /dev/null @@ -1,6 +0,0 @@ -log4j.rootLogger=${hive.root.logger} -hive.root.logger=INFO,console -log4j.appender.console=org.apache.log4j.ConsoleAppender -log4j.appender.console.target=System.err -log4j.appender.console.layout=org.apache.log4j.PatternLayout -log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n diff --git a/tools/hive/single-image/conf/postgresql.conf b/tools/hive/single-image/conf/postgresql.conf index 38d855ed..12b8314a 100644 --- a/tools/hive/single-image/conf/postgresql.conf +++ b/tools/hive/single-image/conf/postgresql.conf @@ -416,7 +416,7 @@ log_rotation_size = 0 # Automatic rotation of logfiles will #log_duration = off #log_error_verbosity = default # terse, default, or verbose messages #log_hostname = off -log_line_prefix = '< %m >' # special values: +#log_line_prefix = '< %m >' # special values: # %a = application name # %u = user name # %d = database name diff --git a/tools/hive/single-image/templates/hive-site.xml.template b/tools/hive/single-image/templates/hive-site.xml.template index 743836ce..8e61b910 100755 --- a/tools/hive/single-image/templates/hive-site.xml.template +++ 
b/tools/hive/single-image/templates/hive-site.xml.template @@ -1,38 +1,23 @@ + javax.jdo.option.ConnectionDriverName org.postgresql.Driver - - javax.jdo.option.ConnectionUserName - hive - - - hive.metastore.warehouse.dir - /apps/hive/warehouse - - - hive.metastore.client.socket.timeout - 60 - - - hive.server2.enable.doAs - true - javax.jdo.option.ConnectionURL jdbc:postgresql://localhost/metastore - hive.metastore.execute.setugi - true - - - hive.input.format - org.apache.hadoop.hive.ql.io.HiveInputFormat + javax.jdo.option.ConnectionUserName + hive javax.jdo.option.ConnectionPassword hive + + hive.metastore.warehouse.dir + /apps/hive/warehouse + From 1bfcff9b7669f6df205a5906b5a552c128307e5e Mon Sep 17 00:00:00 2001 From: "Susan X. Huynh" Date: Wed, 22 Aug 2018 12:26:50 -0700 Subject: [PATCH 10/15] (1) removed ubuntu/, hadoop/, hive/, (2) removed redundant HADOOP_ env vars, (3) added "{{}}" to templated variable --- tools/hive/hadoop-2.6.0/Dockerfile | 53 -- tools/hive/hadoop-2.6.0/LICENSE | 202 ------ tools/hive/hadoop-2.6.0/scripts/bootstrap.sh | 24 - .../templates/core-site.xml.template | 6 - .../templates/yarn-site.xml.template | 64 -- .../{single-image => hadoop-hive}/Dockerfile | 7 +- .../conf/mapred-site.xml | 0 .../conf/postgresql.conf | 0 .../conf/ssh_config | 0 .../hadoop-hive/scripts/hadoop-bootstrap.sh | 27 + .../scripts/hive-bootstrap.sh | 4 +- .../templates/core-site.xml.template | 2 +- .../templates/hdfs-site.xml.template | 0 .../templates/hive-site.xml.template | 0 .../templates/yarn-site.xml.template | 0 tools/hive/hive_pg/Dockerfile | 78 --- tools/hive/hive_pg/conf/hive-log4j.properties | 88 --- tools/hive/hive_pg/conf/log4j.properties | 6 - tools/hive/hive_pg/conf/postgresql.conf | 630 ------------------ tools/hive/hive_pg/scripts/bootstrap.sh | 49 -- .../hive_pg/templates/hive-site.xml.template | 154 ----- tools/hive/kerberos/Dockerfile | 1 - .../kerberos/templates/core-site.xml.template | 2 +- .../kerberos/templates/hdfs-site.xml.template | 8 
+- .../kerberos/templates/hive-site.xml.template | 4 +- .../kerberos/templates/yarn-site.xml.template | 4 +- tools/hive/single-image/conf/mapred-site.xml | 6 - .../single-image/scripts/hadoop-bootstrap.sh | 31 - .../templates/hdfs-site.xml.template | 6 - tools/hive/ubuntu/Dockerfile | 51 -- tools/hive/ubuntu/base.env | 19 - tools/hive/ubuntu/conf/ssh_config | 5 - 32 files changed, 41 insertions(+), 1490 deletions(-) delete mode 100644 tools/hive/hadoop-2.6.0/Dockerfile delete mode 100644 tools/hive/hadoop-2.6.0/LICENSE delete mode 100755 tools/hive/hadoop-2.6.0/scripts/bootstrap.sh delete mode 100644 tools/hive/hadoop-2.6.0/templates/core-site.xml.template delete mode 100644 tools/hive/hadoop-2.6.0/templates/yarn-site.xml.template rename tools/hive/{single-image => hadoop-hive}/Dockerfile (96%) rename tools/hive/{hadoop-2.6.0 => hadoop-hive}/conf/mapred-site.xml (100%) rename tools/hive/{single-image => hadoop-hive}/conf/postgresql.conf (100%) rename tools/hive/{single-image => hadoop-hive}/conf/ssh_config (100%) create mode 100755 tools/hive/hadoop-hive/scripts/hadoop-bootstrap.sh rename tools/hive/{single-image => hadoop-hive}/scripts/hive-bootstrap.sh (82%) rename tools/hive/{single-image => hadoop-hive}/templates/core-site.xml.template (67%) rename tools/hive/{hadoop-2.6.0 => hadoop-hive}/templates/hdfs-site.xml.template (100%) rename tools/hive/{single-image => hadoop-hive}/templates/hive-site.xml.template (100%) rename tools/hive/{single-image => hadoop-hive}/templates/yarn-site.xml.template (100%) delete mode 100644 tools/hive/hive_pg/Dockerfile delete mode 100644 tools/hive/hive_pg/conf/hive-log4j.properties delete mode 100755 tools/hive/hive_pg/conf/log4j.properties delete mode 100644 tools/hive/hive_pg/conf/postgresql.conf delete mode 100644 tools/hive/hive_pg/scripts/bootstrap.sh delete mode 100755 tools/hive/hive_pg/templates/hive-site.xml.template delete mode 100644 tools/hive/single-image/conf/mapred-site.xml delete mode 100755 
tools/hive/single-image/scripts/hadoop-bootstrap.sh delete mode 100644 tools/hive/single-image/templates/hdfs-site.xml.template delete mode 100644 tools/hive/ubuntu/Dockerfile delete mode 100644 tools/hive/ubuntu/base.env delete mode 100644 tools/hive/ubuntu/conf/ssh_config diff --git a/tools/hive/hadoop-2.6.0/Dockerfile b/tools/hive/hadoop-2.6.0/Dockerfile deleted file mode 100644 index 43f8e613..00000000 --- a/tools/hive/hadoop-2.6.0/Dockerfile +++ /dev/null @@ -1,53 +0,0 @@ -# Creates pseudo distributed hadoop 2.6.0 in ubuntu -FROM cdh5-ubuntu - -USER root - -ENV JAVA_HOME /usr/lib/jvm/java-8-oracle -ENV HADOOP_VERSION 2.6.0 -ENV CDH_VERSION 5 -ENV CDH_EXACT_VERSION 5.11.0 -ENV HADOOP_HOME /usr/local/hadoop -ENV HADOOP_PREFIX /usr/local/hadoop -ENV HADOOP_COMMON_HOME /usr/local/hadoop -ENV HADOOP_HDFS_HOME /usr/local/hadoop -ENV HADOOP_MAPRED_HOME /usr/local/hadoop -ENV HADOOP_YARN_HOME /usr/local/hadoop -ENV HADOOP_CONF_DIR /usr/local/hadoop/etc/hadoop - -ENV PATH $PATH:$JAVA_HOME/bin:$HIVE_HOME/bin:$HADOOP_HOME:$HADOOP_HOME/bin - -ADD ./deps/hadoop-${HADOOP_VERSION}-cdh${CDH_EXACT_VERSION}.tar.gz /usr/local -RUN cd /usr/local && ln -s ./hadoop-${HADOOP_VERSION}-cdh${CDH_EXACT_VERSION} hadoop - -RUN sed -i '/^export JAVA_HOME/ s:.*:export JAVA_HOME=/usr/lib/jvm/java-8-oracle\nexport HADOOP_PREFIX=/usr/local/hadoop\nexport HADOOP_HOME=/usr/local/hadoop\n:' $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh -RUN sed -i '/^export HADOOP_CONF_DIR/ s:.*:export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop/:' $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh - -# copy hadoop site xml files -RUN mkdir $HADOOP_PREFIX/input -RUN cp $HADOOP_PREFIX/etc/hadoop/*.xml $HADOOP_PREFIX/input - -# pseudo distributed configurations -ADD templates/core-site.xml.template $HADOOP_PREFIX/etc/hadoop/core-site.xml.template -ADD templates/hdfs-site.xml.template $HADOOP_PREFIX/etc/hadoop/hdfs-site.xml.template -ADD conf/mapred-site.xml $HADOOP_PREFIX/etc/hadoop/mapred-site.xml -ADD 
templates/yarn-site.xml.template $HADOOP_PREFIX/etc/hadoop/yarn-site.xml.template - -# format namenode -RUN $HADOOP_PREFIX/bin/hdfs namenode -format - -# fixing the libhadoop.so -RUN rm -rf /usr/local/hadoop/lib/native/* -ADD ./deps/hadoop-native-64-2.6.0.tar /usr/local/hadoop/lib/native/ - -# add and set permissions for bootstrap script -ADD scripts/bootstrap.sh /etc/hadoop-bootstrap.sh -RUN chown root:root /etc/hadoop-bootstrap.sh -RUN chmod 700 /etc/hadoop-bootstrap.sh - -RUN chmod +x /usr/local/hadoop/etc/hadoop/*-env.sh - -#for exposed ports refer -#https://www.cloudera.com/documentation/enterprise/5-4-x/topics/cdh_ig_ports_cdh5.html - -EXPOSE 50010 50020 50070 50075 50090 8020 9000 10020 19888 8030 8031 8032 8033 8040 8042 8088 \ No newline at end of file diff --git a/tools/hive/hadoop-2.6.0/LICENSE b/tools/hive/hadoop-2.6.0/LICENSE deleted file mode 100644 index e06d2081..00000000 --- a/tools/hive/hadoop-2.6.0/LICENSE +++ /dev/null @@ -1,202 +0,0 @@ -Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. 
- - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. 
For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. 
If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. 
You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. 
Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "{}" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. 
- - Copyright {yyyy} {name of copyright owner} - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - diff --git a/tools/hive/hadoop-2.6.0/scripts/bootstrap.sh b/tools/hive/hadoop-2.6.0/scripts/bootstrap.sh deleted file mode 100755 index 7df86ffe..00000000 --- a/tools/hive/hadoop-2.6.0/scripts/bootstrap.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash -set -x -/usr/sbin/sshd -: ${HADOOP_PREFIX:=/usr/local/hadoop} - -$HADOOP_PREFIX/etc/hadoop/hadoop-env.sh - -rm /tmp/*.pid - -# installing libraries if any - (resource urls added comma separated to the ACP system variable) -cd $HADOOP_PREFIX/share/hadoop/common ; for cp in ${ACP//,/ }; do echo == $cp; curl -LO $cp ; done; cd - - -# templating of config files -sed s/HOSTNAME/$HOSTNAME/ /usr/local/hadoop/etc/hadoop/core-site.xml.template > /usr/local/hadoop/etc/hadoop/core-site.xml -sed s/HOSTNAME/$HOSTNAME/ /usr/local/hadoop/etc/hadoop/yarn-site.xml.template > /usr/local/hadoop/etc/hadoop/yarn-site.xml -sed s/HOSTNAME/$HOSTNAME/ /usr/local/hadoop/etc/hadoop/hdfs-site.xml.template > /usr/local/hadoop/etc/hadoop/hdfs-site.xml - - -$HADOOP_PREFIX/sbin/start-dfs.sh -$HADOOP_PREFIX/sbin/start-yarn.sh - -if [[ $1 == "-bash" ]]; then - /bin/bash -fi diff --git a/tools/hive/hadoop-2.6.0/templates/core-site.xml.template b/tools/hive/hadoop-2.6.0/templates/core-site.xml.template deleted file mode 100644 index 3576bbd5..00000000 --- a/tools/hive/hadoop-2.6.0/templates/core-site.xml.template +++ /dev/null @@ -1,6 +0,0 @@ - - - fs.defaultFS - 
hdfs://HOSTNAME:9000 - - diff --git a/tools/hive/hadoop-2.6.0/templates/yarn-site.xml.template b/tools/hive/hadoop-2.6.0/templates/yarn-site.xml.template deleted file mode 100644 index 37c1850a..00000000 --- a/tools/hive/hadoop-2.6.0/templates/yarn-site.xml.template +++ /dev/null @@ -1,64 +0,0 @@ - - - yarn.nodemanager.aux-services - mapreduce_shuffle - - - - yarn.application.classpath - /usr/local/hadoop/etc/hadoop, /usr/local/hadoop/share/hadoop/common/*, /usr/local/hadoop/share/hadoop/common/lib/*, /usr/local/hadoop/share/hadoop/hdfs/*, /usr/local/hadoop/share/hadoop/hdfs/lib/*, /usr/local/hadoop/share/hadoop/mapreduce/*, /usr/local/hadoop/share/hadoop/mapreduce/lib/*, /usr/local/hadoop/share/hadoop/yarn/*, /usr/local/hadoop/share/hadoop/yarn/lib/* - - - - - Number of seconds after an application finishes before the nodemanager's - DeletionService will delete the application's localized file directory - and log directory. - - To diagnose Yarn application problems, set this property's value large - enough (for example, to 600 = 10 minutes) to permit examination of these - directories. After changing the property's value, you must restart the - nodemanager in order for it to have an effect. - - The roots of Yarn applications' work directories is configurable with - the yarn.nodemanager.local-dirs property (see below), and the roots - of the Yarn applications' log directories is configurable with the - yarn.nodemanager.log-dirs property (see also below). - - yarn.nodemanager.delete.debug-delay-sec - 600 - - - - Indicate to clients whether Timeline service is enabled or not. - If enabled, the TimelineClient library used by end-users will post entities - and events to the Timeline server. - yarn.timeline-service.enabled - true - - - - The hostname of the Timeline service web application. - yarn.timeline-service.hostname - 0.0.0.0 - - - - Enables cross-origin support (CORS) for web services where - cross-origin web response headers are needed. 
For example, javascript making - a web services request to the timeline server. - yarn.timeline-service.http-cross-origin.enabled - true - - - - yarn.resourcemanager.webapp.cross-origin.enabled - true - - - - Publish YARN information to Timeline Server - yarn.resourcemanager.system-metrics-publisher.enabled - true - - diff --git a/tools/hive/single-image/Dockerfile b/tools/hive/hadoop-hive/Dockerfile similarity index 96% rename from tools/hive/single-image/Dockerfile rename to tools/hive/hadoop-hive/Dockerfile index f70dc52b..c7a97d53 100644 --- a/tools/hive/single-image/Dockerfile +++ b/tools/hive/hadoop-hive/Dockerfile @@ -8,10 +8,6 @@ ENV CDH_VERSION 5 ENV CDH_EXACT_VERSION 5.11.0 ENV HADOOP_HOME /usr/local/hadoop ENV HADOOP_PREFIX /usr/local/hadoop -ENV HADOOP_COMMON_HOME /usr/local/hadoop -ENV HADOOP_HDFS_HOME /usr/local/hadoop -ENV HADOOP_MAPRED_HOME /usr/local/hadoop -ENV HADOOP_YARN_HOME /usr/local/hadoop ENV HADOOP_CONF_DIR /usr/local/hadoop/etc/hadoop ENV HIVE_HOME /usr/local/hive ENV HIVE_CONF /usr/local/hive/conf @@ -72,9 +68,10 @@ RUN curl -L http://archive.cloudera.com/cdh${CDH_VERSION}/cdh/${CDH_VERSION}/had cd /usr/local && \ ln -s ./hadoop-${HADOOP_VERSION}-cdh${CDH_EXACT_VERSION} hadoop +# need to define JAVA_HOME inside hadoop-env.sh RUN sed -i '/^export JAVA_HOME/ s:.*:export JAVA_HOME=/usr/lib/jvm/java-8-oracle\n:' $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh -# pseudo distributed configurations +# pseudo distributed configurations of hadoop ADD templates/core-site.xml.template $HADOOP_PREFIX/etc/hadoop/core-site.xml.template ADD templates/hdfs-site.xml.template $HADOOP_PREFIX/etc/hadoop/hdfs-site.xml.template ADD conf/mapred-site.xml $HADOOP_PREFIX/etc/hadoop/mapred-site.xml diff --git a/tools/hive/hadoop-2.6.0/conf/mapred-site.xml b/tools/hive/hadoop-hive/conf/mapred-site.xml similarity index 100% rename from tools/hive/hadoop-2.6.0/conf/mapred-site.xml rename to tools/hive/hadoop-hive/conf/mapred-site.xml diff --git 
a/tools/hive/single-image/conf/postgresql.conf b/tools/hive/hadoop-hive/conf/postgresql.conf similarity index 100% rename from tools/hive/single-image/conf/postgresql.conf rename to tools/hive/hadoop-hive/conf/postgresql.conf diff --git a/tools/hive/single-image/conf/ssh_config b/tools/hive/hadoop-hive/conf/ssh_config similarity index 100% rename from tools/hive/single-image/conf/ssh_config rename to tools/hive/hadoop-hive/conf/ssh_config diff --git a/tools/hive/hadoop-hive/scripts/hadoop-bootstrap.sh b/tools/hive/hadoop-hive/scripts/hadoop-bootstrap.sh new file mode 100755 index 00000000..9f034511 --- /dev/null +++ b/tools/hive/hadoop-hive/scripts/hadoop-bootstrap.sh @@ -0,0 +1,27 @@ +#!/bin/bash +set -x + +# used by Hadoop +/usr/sbin/sshd + +: ${HADOOP_PREFIX:=/usr/local/hadoop} + +$HADOOP_PREFIX/etc/hadoop/hadoop-env.sh + +rm /tmp/*.pid + +# templating of config files +sed s/{{HOSTNAME}}/$HOSTNAME/ /usr/local/hadoop/etc/hadoop/core-site.xml.template > /usr/local/hadoop/etc/hadoop/core-site.xml +sed s/{{HOSTNAME}}/$HOSTNAME/ /usr/local/hadoop/etc/hadoop/yarn-site.xml.template > /usr/local/hadoop/etc/hadoop/yarn-site.xml +sed s/{{HOSTNAME}}/$HOSTNAME/ /usr/local/hadoop/etc/hadoop/hdfs-site.xml.template > /usr/local/hadoop/etc/hadoop/hdfs-site.xml + +# format namenode +$HADOOP_PREFIX/bin/hdfs namenode -format + +# start hdfs and yarn +$HADOOP_PREFIX/sbin/start-dfs.sh +$HADOOP_PREFIX/sbin/start-yarn.sh + +if [[ $1 == "-bash" ]]; then + /bin/bash +fi diff --git a/tools/hive/single-image/scripts/hive-bootstrap.sh b/tools/hive/hadoop-hive/scripts/hive-bootstrap.sh similarity index 82% rename from tools/hive/single-image/scripts/hive-bootstrap.sh rename to tools/hive/hadoop-hive/scripts/hive-bootstrap.sh index 1dae776a..4d40308f 100644 --- a/tools/hive/single-image/scripts/hive-bootstrap.sh +++ b/tools/hive/hadoop-hive/scripts/hive-bootstrap.sh @@ -28,8 +28,8 @@ hdfs dfs -chown -R hive:supergroup /apps/hive hdfs dfs -chmod 777 /apps/hive/warehouse # altering the 
hive-site configuration -sed s/HOSTNAME/$HOSTNAME/ /usr/local/hive/conf/hive-site.xml.template > /usr/local/hive/conf/hive-site.xml -sed s/HOSTNAME/$HOSTNAME/ /opt/files/hive-site.xml.template > /opt/files/hive-site.xml +sed s/{{HOSTNAME}}/$HOSTNAME/ /usr/local/hive/conf/hive-site.xml.template > /usr/local/hive/conf/hive-site.xml +sed s/{{HOSTNAME}}/$HOSTNAME/ /opt/files/hive-site.xml.template > /opt/files/hive-site.xml # start hive metastore server $HIVE_HOME/bin/hive --service metastore & diff --git a/tools/hive/single-image/templates/core-site.xml.template b/tools/hive/hadoop-hive/templates/core-site.xml.template similarity index 67% rename from tools/hive/single-image/templates/core-site.xml.template rename to tools/hive/hadoop-hive/templates/core-site.xml.template index 3576bbd5..11dc4a93 100644 --- a/tools/hive/single-image/templates/core-site.xml.template +++ b/tools/hive/hadoop-hive/templates/core-site.xml.template @@ -1,6 +1,6 @@ fs.defaultFS - hdfs://HOSTNAME:9000 + hdfs://{{HOSTNAME}}:9000 diff --git a/tools/hive/hadoop-2.6.0/templates/hdfs-site.xml.template b/tools/hive/hadoop-hive/templates/hdfs-site.xml.template similarity index 100% rename from tools/hive/hadoop-2.6.0/templates/hdfs-site.xml.template rename to tools/hive/hadoop-hive/templates/hdfs-site.xml.template diff --git a/tools/hive/single-image/templates/hive-site.xml.template b/tools/hive/hadoop-hive/templates/hive-site.xml.template similarity index 100% rename from tools/hive/single-image/templates/hive-site.xml.template rename to tools/hive/hadoop-hive/templates/hive-site.xml.template diff --git a/tools/hive/single-image/templates/yarn-site.xml.template b/tools/hive/hadoop-hive/templates/yarn-site.xml.template similarity index 100% rename from tools/hive/single-image/templates/yarn-site.xml.template rename to tools/hive/hadoop-hive/templates/yarn-site.xml.template diff --git a/tools/hive/hive_pg/Dockerfile b/tools/hive/hive_pg/Dockerfile deleted file mode 100644 index 1911d77d..00000000 --- 
a/tools/hive/hive_pg/Dockerfile +++ /dev/null @@ -1,78 +0,0 @@ -FROM cdh5-hadoop - -ENV JAVA_HOME /usr/lib/jvm/java-8-oracle -ENV HIVE_HOME /usr/local/hive -ENV HIVE_CONF /usr/local/hive/conf -ENV HIVE_VERSION 1.1.0 -ENV HADOOP_VERSION 2.6.0 -ENV CDH_VERSION 5 -ENV CDH_EXACT_VERSION 5.11.0 -ENV POSTGRES_VERSION 9.5 -ENV POSTGRESQL_MAIN /var/lib/postgresql/9.5/main/ -ENV POSTGRESQL_CONFIG_FILE /var/lib/postgresql/9.5/main/postgresql.conf -ENV POSTGRESQL_BIN /usr/lib/postgresql/9.5/bin/postgres -ENV PGPASSWORD hive -ENV HADOOP_HOME /usr/local/hadoop -ENV HADOOP_PREFIX /usr/local/hadoop -ENV HADOOP_COMMON_HOME /usr/local/hadoop -ENV HADOOP_HDFS_HOME /usr/local/hadoop -ENV HADOOP_MAPRED_HOME /usr/local/hadoop -ENV HADOOP_YARN_HOME /usr/local/hadoop -ENV HADOOP_CONF_DIR /usr/local/hadoop/etc/hadoop - -ENV PATH $PATH:$JAVA_HOME/bin:$HIVE_HOME/bin:$HADOOP_HOME:$HADOOP_HOME/bin - -# add hive -ADD ./deps/hive-1.1.0-cdh${CDH_EXACT_VERSION}.tar.gz /usr/local/ -RUN mv /usr/local/hive-${HIVE_VERSION}-cdh${CDH_EXACT_VERSION} /usr/local/hive - - -# add postgresql jdbc jar to classpath -RUN ln -s /usr/share/java/postgresql-jdbc4.jar $HIVE_HOME/lib/postgresql-jdbc4.jar - -# to configure postgres as hive metastore backend -RUN sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt/ `lsb_release -cs`-pgdg main" >> /etc/apt/sources.list.d/pgdg.list' -RUN wget -q https://www.postgresql.org/media/keys/ACCC4CF8.asc -O - | sudo apt-key add - -RUN apt-get update -y -RUN apt-get -yq install vim postgresql-9.5 libpostgresql-jdbc-java - -USER postgres -# initialize hive metastore db -# create metastore db, hive user and assign privileges -RUN cd $HIVE_HOME/scripts/metastore/upgrade/postgres/ &&\ - /etc/init.d/postgresql start &&\ - psql --command "CREATE DATABASE metastore;" &&\ - psql --command "CREATE USER hive WITH PASSWORD 'hive';" && \ - psql --command "ALTER USER hive WITH SUPERUSER;" && \ - psql --command "GRANT ALL PRIVILEGES ON DATABASE metastore TO hive;" && \ - psql -U hive -d 
metastore -h localhost -f hive-schema-${HIVE_VERSION}.postgres.sql - - -# revert back to default user -USER root - -# disable ssl in postgres.conf -ADD conf/postgresql.conf $POSTGRESQL_MAIN -RUN echo $POSTGRESQL_MAIN -RUN echo $POSTGRESQL_CONFIG_FILE -RUN chown postgres:postgres $POSTGRESQL_CONFIG_FILE -RUN sed -i -e 's/peer/md5/g' /etc/postgresql/$POSTGRES_VERSION/main/pg_hba.conf - - -# copy config, sql, data files to /opt/files -RUN mkdir /opt/files -RUN echo $HIVE_CONF -ADD templates/hive-site.xml.template /opt/files/ -ADD conf/hive-log4j.properties /opt/files/ -ADD templates/hive-site.xml.template $HIVE_CONF/hive-site.xml.template -ADD conf/hive-log4j.properties $HIVE_CONF/hive-log4j.properties - -# set permissions for hive bootstrap file -ADD scripts/bootstrap.sh /etc/hive-bootstrap.sh -RUN chown root:root /etc/hive-bootstrap.sh -RUN chmod 700 /etc/hive-bootstrap.sh - -EXPOSE 10000 10001 10002 10003 9083 50111 5432 - -# run bootstrap script -CMD ["/etc/hive-bootstrap.sh", "-d"] diff --git a/tools/hive/hive_pg/conf/hive-log4j.properties b/tools/hive/hive_pg/conf/hive-log4j.properties deleted file mode 100644 index b258a503..00000000 --- a/tools/hive/hive_pg/conf/hive-log4j.properties +++ /dev/null @@ -1,88 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -# Define some default values that can be overridden by system properties -hive.log.threshold=ALL -hive.root.logger=INFO,DRFA -hive.log.dir=/tmp/logs/ -hive.log.file=hive.log - -# Define the root logger to the system property "hadoop.root.logger". -log4j.rootLogger=${hive.root.logger}, EventCounter - -# Logging Threshold -log4j.threshold=${hive.log.threshold} - -# -# Daily Rolling File Appender -# -# Use the PidDailyerRollingFileAppend class instead if you want to use separate log files -# for different CLI session. -# -# log4j.appender.DRFA=org.apache.hadoop.hive.ql.log.PidDailyRollingFileAppender - -log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender - -log4j.appender.DRFA.File=${hive.log.dir}/${hive.log.file} - -# Rollver at midnight -log4j.appender.DRFA.DatePattern=.yyyy-MM-dd - -# 30-day backup -#log4j.appender.DRFA.MaxBackupIndex=30 -log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout - -# Pattern format: Date LogLevel LoggerName LogMessage -#log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n -# Debugging Pattern format -log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n - - -# -# console -# Add "console" to rootlogger above if you want to use this -# - -log4j.appender.console=org.apache.log4j.ConsoleAppender -log4j.appender.console.target=System.err -log4j.appender.console.layout=org.apache.log4j.PatternLayout -log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n -log4j.appender.console.encoding=UTF-8 - -#custom logging levels -#log4j.logger.xxx=DEBUG - -# -# Event Counter Appender -# Sends counts of logging messages at different severity levels to Hadoop Metrics. 
-# -log4j.appender.EventCounter=org.apache.hadoop.hive.shims.HiveEventCounter - - -log4j.category.DataNucleus=ERROR,DRFA -log4j.category.Datastore=ERROR,DRFA -log4j.category.Datastore.Schema=ERROR,DRFA -log4j.category.JPOX.Datastore=ERROR,DRFA -log4j.category.JPOX.Plugin=ERROR,DRFA -log4j.category.JPOX.MetaData=ERROR,DRFA -log4j.category.JPOX.Query=ERROR,DRFA -log4j.category.JPOX.General=ERROR,DRFA -log4j.category.JPOX.Enhancer=ERROR,DRFA - - -# Silence useless ZK logs -log4j.logger.org.apache.zookeeper.server.NIOServerCnxn=WARN,DRFA -log4j.logger.org.apache.zookeeper.ClientCnxnSocketNIO=WARN,DRFA diff --git a/tools/hive/hive_pg/conf/log4j.properties b/tools/hive/hive_pg/conf/log4j.properties deleted file mode 100755 index bd066b0b..00000000 --- a/tools/hive/hive_pg/conf/log4j.properties +++ /dev/null @@ -1,6 +0,0 @@ -log4j.rootLogger=${hive.root.logger} -hive.root.logger=INFO,console -log4j.appender.console=org.apache.log4j.ConsoleAppender -log4j.appender.console.target=System.err -log4j.appender.console.layout=org.apache.log4j.PatternLayout -log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n diff --git a/tools/hive/hive_pg/conf/postgresql.conf b/tools/hive/hive_pg/conf/postgresql.conf deleted file mode 100644 index 38d855ed..00000000 --- a/tools/hive/hive_pg/conf/postgresql.conf +++ /dev/null @@ -1,630 +0,0 @@ -# ----------------------------- -# PostgreSQL configuration file -# ----------------------------- -# -# This file consists of lines of the form: -# -# name = value -# -# (The "=" is optional.) Whitespace may be used. Comments are introduced with -# "#" anywhere on a line. The complete list of parameter names and allowed -# values can be found in the PostgreSQL documentation. -# -# The commented-out settings shown in this file represent the default values. -# Re-commenting a setting is NOT sufficient to revert it to the default value; -# you need to reload the server. 
-# -# This file is read on server startup and when the server receives a SIGHUP -# signal. If you edit the file on a running system, you have to SIGHUP the -# server for the changes to take effect, or use "pg_ctl reload". Some -# parameters, which are marked below, require a server shutdown and restart to -# take effect. -# -# Any parameter can also be given as a command-line option to the server, e.g., -# "postgres -c log_connections=on". Some parameters can be changed at run time -# with the "SET" SQL command. -# -# Memory units: kB = kilobytes Time units: ms = milliseconds -# MB = megabytes s = seconds -# GB = gigabytes min = minutes -# TB = terabytes h = hours -# d = days - - -#------------------------------------------------------------------------------ -# FILE LOCATIONS -#------------------------------------------------------------------------------ - -# The default values of these variables are driven from the -D command-line -# option or PGDATA environment variable, represented here as ConfigDir. - -data_directory = '/var/lib/postgresql/9.5/main' # use data in another directory -# (change requires restart) -hba_file = '/var/lib/postgresql/9.5/pg_hba.conf' # host-based authentication file -# (change requires restart) -ident_file = '/var/lib/postgresql/9.5/pg_ident.conf' # ident configuration file -# (change requires restart) - -# If external_pid_file is not explicitly set, no extra PID file is written. 
-#external_pid_file = '' # write an extra PID file -# (change requires restart) - - -#------------------------------------------------------------------------------ -# CONNECTIONS AND AUTHENTICATION -#------------------------------------------------------------------------------ - -# - Connection Settings - - -listen_addresses = '*' # what IP address(es) to listen on; -# comma-separated list of addresses; -# defaults to 'localhost'; use '*' for all -# (change requires restart) -port = 5432 # (change requires restart) -max_connections = 100 # (change requires restart) -# Note: Increasing max_connections costs ~400 bytes of shared memory per -# connection slot, plus lock space (see max_locks_per_transaction). -#superuser_reserved_connections = 3 # (change requires restart) -#unix_socket_directories = '/tmp' # comma-separated list of directories -# (change requires restart) -#unix_socket_group = '' # (change requires restart) -#unix_socket_permissions = 0777 # begin with 0 to use octal notation -# (change requires restart) -#bonjour = off # advertise server via Bonjour -# (change requires restart) -#bonjour_name = '' # defaults to the computer name -# (change requires restart) - -# - Security and Authentication - - -#authentication_timeout = 1min # 1s-600s -ssl = off # (change requires restart) -#ssl_ciphers = 'HIGH:MEDIUM:+3DES:!aNULL' # allowed SSL ciphers -# (change requires restart) -#ssl_prefer_server_ciphers = on # (change requires restart) -#ssl_ecdh_curve = 'prime256v1' # (change requires restart) -#ssl_cert_file = 'server.crt' # (change requires restart) -#ssl_key_file = 'server.key' # (change requires restart) -#ssl_ca_file = '' # (change requires restart) -#ssl_crl_file = '' # (change requires restart) -#password_encryption = on -#db_user_namespace = off -#row_security = on - -# GSSAPI using Kerberos -#krb_server_keyfile = '' -#krb_caseins_users = off - -# - TCP Keepalives - -# see "man 7 tcp" for details - -#tcp_keepalives_idle = 0 # TCP_KEEPIDLE, in 
seconds; -# 0 selects the system default -#tcp_keepalives_interval = 0 # TCP_KEEPINTVL, in seconds; -# 0 selects the system default -#tcp_keepalives_count = 0 # TCP_KEEPCNT; -# 0 selects the system default - - -#------------------------------------------------------------------------------ -# RESOURCE USAGE (except WAL) -#------------------------------------------------------------------------------ - -# - Memory - - -shared_buffers = 128MB # min 128kB -# (change requires restart) -#huge_pages = try # on, off, or try -# (change requires restart) -#temp_buffers = 8MB # min 800kB -#max_prepared_transactions = 0 # zero disables the feature -# (change requires restart) -# Note: Increasing max_prepared_transactions costs ~600 bytes of shared memory -# per transaction slot, plus lock space (see max_locks_per_transaction). -# It is not advisable to set max_prepared_transactions nonzero unless you -# actively intend to use prepared transactions. -#work_mem = 4MB # min 64kB -#maintenance_work_mem = 64MB # min 1MB -#autovacuum_work_mem = -1 # min 1MB, or -1 to use maintenance_work_mem -#max_stack_depth = 2MB # min 100kB -#dynamic_shared_memory_type = posix # the default is the first option -# supported by the operating system: -# posix -# sysv -# windows -# mmap -# use none to disable dynamic shared memory - -# - Disk - - -#temp_file_limit = -1 # limits per-session temp file space -# in kB, or -1 for no limit - -# - Kernel Resource Usage - - -#max_files_per_process = 1000 # min 25 -# (change requires restart) -#shared_preload_libraries = '' # (change requires restart) - -# - Cost-Based Vacuum Delay - - -#vacuum_cost_delay = 0 # 0-100 milliseconds -#vacuum_cost_page_hit = 1 # 0-10000 credits -#vacuum_cost_page_miss = 10 # 0-10000 credits -#vacuum_cost_page_dirty = 20 # 0-10000 credits -#vacuum_cost_limit = 200 # 1-10000 credits - -# - Background Writer - - -#bgwriter_delay = 200ms # 10-10000ms between rounds -#bgwriter_lru_maxpages = 100 # 0-1000 max buffers written/round 
-#bgwriter_lru_multiplier = 2.0 # 0-10.0 multipler on buffers scanned/round - -# - Asynchronous Behavior - - -#effective_io_concurrency = 1 # 1-1000; 0 disables prefetching -#max_worker_processes = 8 - - -#------------------------------------------------------------------------------ -# WRITE AHEAD LOG -#------------------------------------------------------------------------------ - -# - Settings - - -#wal_level = minimal # minimal, archive, hot_standby, or logical -# (change requires restart) -#fsync = on # turns forced synchronization on or off -#synchronous_commit = on # synchronization level; -# off, local, remote_write, or on -#wal_sync_method = fsync # the default is the first option -# supported by the operating system: -# open_datasync -# fdatasync (default on Linux) -# fsync -# fsync_writethrough -# open_sync -#full_page_writes = on # recover from partial page writes -#wal_compression = off # enable compression of full-page writes -#wal_log_hints = off # also do full page writes of non-critical updates -# (change requires restart) -#wal_buffers = -1 # min 32kB, -1 sets based on shared_buffers -# (change requires restart) -#wal_writer_delay = 200ms # 1-10000 milliseconds - -#commit_delay = 0 # range 0-100000, in microseconds -#commit_siblings = 5 # range 1-1000 - -# - Checkpoints - - -#checkpoint_timeout = 5min # range 30s-1h -#max_wal_size = 1GB -#min_wal_size = 80MB -#checkpoint_completion_target = 0.5 # checkpoint target duration, 0.0 - 1.0 -#checkpoint_warning = 30s # 0 disables - -# - Archiving - - -#archive_mode = off # enables archiving; off, on, or always -# (change requires restart) -#archive_command = '' # command to use to archive a logfile segment -# placeholders: %p = path of file to archive -# %f = file name only -# e.g. 'test ! 
-f /mnt/server/archivedir/%f && cp %p /mnt/server/archivedir/%f' -#archive_timeout = 0 # force a logfile segment switch after this -# number of seconds; 0 disables - - -#------------------------------------------------------------------------------ -# REPLICATION -#------------------------------------------------------------------------------ - -# - Sending Server(s) - - -# Set these on the master and on any standby that will send replication data. - -#max_wal_senders = 0 # max number of walsender processes -# (change requires restart) -#wal_keep_segments = 0 # in logfile segments, 16MB each; 0 disables -#wal_sender_timeout = 60s # in milliseconds; 0 disables - -#max_replication_slots = 0 # max number of replication slots -# (change requires restart) -#track_commit_timestamp = off # collect timestamp of transaction commit -# (change requires restart) - -# - Master Server - - -# These settings are ignored on a standby server. - -#synchronous_standby_names = '' # standby servers that provide sync rep -# comma-separated list of application_name -# from standby(s); '*' = all -#vacuum_defer_cleanup_age = 0 # number of xacts by which cleanup is delayed - -# - Standby Servers - - -# These settings are ignored on a master server. 
- -#hot_standby = off # "on" allows queries during recovery -# (change requires restart) -#max_standby_archive_delay = 30s # max delay before canceling queries -# when reading WAL from archive; -# -1 allows indefinite delay -#max_standby_streaming_delay = 30s # max delay before canceling queries -# when reading streaming WAL; -# -1 allows indefinite delay -#wal_receiver_status_interval = 10s # send replies at least this often -# 0 disables -#hot_standby_feedback = off # send info from standby to prevent -# query conflicts -#wal_receiver_timeout = 60s # time that receiver waits for -# communication from master -# in milliseconds; 0 disables -#wal_retrieve_retry_interval = 5s # time to wait before retrying to -# retrieve WAL after a failed attempt - - -#------------------------------------------------------------------------------ -# QUERY TUNING -#------------------------------------------------------------------------------ - -# - Planner Method Configuration - - -#enable_bitmapscan = on -#enable_hashagg = on -#enable_hashjoin = on -#enable_indexscan = on -#enable_indexonlyscan = on -#enable_material = on -#enable_mergejoin = on -#enable_nestloop = on -#enable_seqscan = on -#enable_sort = on -#enable_tidscan = on - -# - Planner Cost Constants - - -#seq_page_cost = 1.0 # measured on an arbitrary scale -#random_page_cost = 4.0 # same scale as above -#cpu_tuple_cost = 0.01 # same scale as above -#cpu_index_tuple_cost = 0.005 # same scale as above -#cpu_operator_cost = 0.0025 # same scale as above -#effective_cache_size = 4GB - -# - Genetic Query Optimizer - - -#geqo = on -#geqo_threshold = 12 -#geqo_effort = 5 # range 1-10 -#geqo_pool_size = 0 # selects default based on effort -#geqo_generations = 0 # selects default based on effort -#geqo_selection_bias = 2.0 # range 1.5-2.0 -#geqo_seed = 0.0 # range 0.0-1.0 - -# - Other Planner Options - - -#default_statistics_target = 100 # range 1-10000 -#constraint_exclusion = partition # on, off, or partition 
-#cursor_tuple_fraction = 0.1 # range 0.0-1.0 -#from_collapse_limit = 8 -#join_collapse_limit = 8 # 1 disables collapsing of explicit -# JOIN clauses - - -#------------------------------------------------------------------------------ -# ERROR REPORTING AND LOGGING -#------------------------------------------------------------------------------ - -# - Where to Log - - -log_destination = 'stderr' # Valid values are combinations of -# stderr, csvlog, syslog, and eventlog, -# depending on platform. csvlog -# requires logging_collector to be on. - -# This is used when logging to stderr: -logging_collector = on # Enable capturing of stderr and csvlog -# into log files. Required to be on for -# csvlogs. -# (change requires restart) - -# These are only used if logging_collector is on: -log_directory = 'pg_log' # directory where log files are written, -# can be absolute or relative to PGDATA -log_filename = 'postgresql-%a.log' # log file name pattern, -# can include strftime() escapes -#log_file_mode = 0600 # creation mode for log files, -# begin with 0 to use octal notation -log_truncate_on_rotation = on # If on, an existing log file with the -# same name as the new log file will be -# truncated rather than appended to. -# But such truncation only occurs on -# time-driven rotation, not on restarts -# or size-driven rotation. Default is -# off, meaning append to existing files -# in all cases. -log_rotation_age = 1d # Automatic rotation of logfiles will -# happen after that time. 0 disables. -log_rotation_size = 0 # Automatic rotation of logfiles will -# happen after that much log output. -# 0 disables. 
- -# These are relevant when logging to syslog: -#syslog_facility = 'LOCAL0' -#syslog_ident = 'postgres' - -# This is only relevant when logging to eventlog (win32): -#event_source = 'PostgreSQL' - -# - When to Log - - -#client_min_messages = notice # values in order of decreasing detail: -# debug5 -# debug4 -# debug3 -# debug2 -# debug1 -# log -# notice -# warning -# error - -#log_min_messages = warning # values in order of decreasing detail: -# debug5 -# debug4 -# debug3 -# debug2 -# debug1 -# info -# notice -# warning -# error -# log -# fatal -# panic - -#log_min_error_statement = error # values in order of decreasing detail: -# debug5 -# debug4 -# debug3 -# debug2 -# debug1 -# info -# notice -# warning -# error -# log -# fatal -# panic (effectively off) - -#log_min_duration_statement = -1 # -1 is disabled, 0 logs all statements -# and their durations, > 0 logs only -# statements running at least this number -# of milliseconds - - -# - What to Log - - -#debug_print_parse = off -#debug_print_rewritten = off -#debug_print_plan = off -#debug_pretty_print = on -#log_checkpoints = off -#log_connections = off -#log_disconnections = off -#log_duration = off -#log_error_verbosity = default # terse, default, or verbose messages -#log_hostname = off -log_line_prefix = '< %m >' # special values: -# %a = application name -# %u = user name -# %d = database name -# %r = remote host and port -# %h = remote host -# %p = process ID -# %t = timestamp without milliseconds -# %m = timestamp with milliseconds -# %i = command tag -# %e = SQL state -# %c = session ID -# %l = session line number -# %s = session start timestamp -# %v = virtual transaction ID -# %x = transaction ID (0 if none) -# %q = stop here in non-session -# processes -# %% = '%' -# e.g. 
'<%u%%%d> ' -#log_lock_waits = off # log lock waits >= deadlock_timeout -#log_statement = 'none' # none, ddl, mod, all -#log_replication_commands = off -#log_temp_files = -1 # log temporary files equal or larger -# than the specified size in kilobytes; -# -1 disables, 0 logs all temp files -#log_timezone = 'GMT' - - -# - Process Title - - -#cluster_name = '' # added to process titles if nonempty -# (change requires restart) -#update_process_title = on - - -#------------------------------------------------------------------------------ -# RUNTIME STATISTICS -#------------------------------------------------------------------------------ - -# - Query/Index Statistics Collector - - -#track_activities = on -#track_counts = on -#track_io_timing = off -#track_functions = none # none, pl, all -#track_activity_query_size = 1024 # (change requires restart) -#stats_temp_directory = 'pg_stat_tmp' - - -# - Statistics Monitoring - - -#log_parser_stats = off -#log_planner_stats = off -#log_executor_stats = off -#log_statement_stats = off - - -#------------------------------------------------------------------------------ -# AUTOVACUUM PARAMETERS -#------------------------------------------------------------------------------ - -#autovacuum = on # Enable autovacuum subprocess? 'on' -# requires track_counts to also be on. -#log_autovacuum_min_duration = -1 # -1 disables, 0 logs all actions and -# their durations, > 0 logs only -# actions running at least this number -# of milliseconds. 
-#autovacuum_max_workers = 3 # max number of autovacuum subprocesses -# (change requires restart) -#autovacuum_naptime = 1min # time between autovacuum runs -#autovacuum_vacuum_threshold = 50 # min number of row updates before -# vacuum -#autovacuum_analyze_threshold = 50 # min number of row updates before -# analyze -#autovacuum_vacuum_scale_factor = 0.2 # fraction of table size before vacuum -#autovacuum_analyze_scale_factor = 0.1 # fraction of table size before analyze -#autovacuum_freeze_max_age = 200000000 # maximum XID age before forced vacuum -# (change requires restart) -#autovacuum_multixact_freeze_max_age = 400000000 # maximum multixact age -# before forced vacuum -# (change requires restart) -#autovacuum_vacuum_cost_delay = 20ms # default vacuum cost delay for -# autovacuum, in milliseconds; -# -1 means use vacuum_cost_delay -#autovacuum_vacuum_cost_limit = -1 # default vacuum cost limit for -# autovacuum, -1 means use -# vacuum_cost_limit - - -#------------------------------------------------------------------------------ -# CLIENT CONNECTION DEFAULTS -#------------------------------------------------------------------------------ - -# - Statement Behavior - - -#search_path = '"$user", public' # schema names -#default_tablespace = '' # a tablespace name, '' uses the default -#temp_tablespaces = '' # a list of tablespace names, '' uses -# only default tablespace -#check_function_bodies = on -#default_transaction_isolation = 'read committed' -#default_transaction_read_only = off -#default_transaction_deferrable = off -#session_replication_role = 'origin' -#statement_timeout = 0 # in milliseconds, 0 is disabled -#lock_timeout = 0 # in milliseconds, 0 is disabled -#vacuum_freeze_min_age = 50000000 -#vacuum_freeze_table_age = 150000000 -#vacuum_multixact_freeze_min_age = 5000000 -#vacuum_multixact_freeze_table_age = 150000000 -#bytea_output = 'hex' # hex, escape -#xmlbinary = 'base64' -#xmloption = 'content' -#gin_fuzzy_search_limit = 0 
-#gin_pending_list_limit = 4MB - -# - Locale and Formatting - - -#datestyle = 'iso, mdy' -#intervalstyle = 'postgres' -#timezone = 'GMT' -#timezone_abbreviations = 'Default' # Select the set of available time zone -# abbreviations. Currently, there are -# Default -# Australia (historical usage) -# India -# You can create your own file in -# share/timezonesets/. -#extra_float_digits = 0 # min -15, max 3 -#client_encoding = sql_ascii # actually, defaults to database -# encoding - -# These settings are initialized by initdb, but they can be changed. -#lc_messages = 'C' # locale for system error message -# strings -#lc_monetary = 'C' # locale for monetary formatting -#lc_numeric = 'C' # locale for number formatting -#lc_time = 'C' # locale for time formatting - -# default configuration for text search -#default_text_search_config = 'pg_catalog.simple' - -# - Other Defaults - - -#dynamic_library_path = '$libdir' -#local_preload_libraries = '' -#session_preload_libraries = '' - - -#------------------------------------------------------------------------------ -# LOCK MANAGEMENT -#------------------------------------------------------------------------------ - -#deadlock_timeout = 1s -#max_locks_per_transaction = 64 # min 10 -# (change requires restart) -# Note: Each lock table slot uses ~270 bytes of shared memory, and there are -# max_locks_per_transaction * (max_connections + max_prepared_transactions) -# lock table slots. 
-#max_pred_locks_per_transaction = 64 # min 10 -# (change requires restart) - - -#------------------------------------------------------------------------------ -# VERSION/PLATFORM COMPATIBILITY -#------------------------------------------------------------------------------ - -# - Previous PostgreSQL Versions - - -#array_nulls = on -#backslash_quote = safe_encoding # on, off, or safe_encoding -#default_with_oids = off -#escape_string_warning = on -#lo_compat_privileges = off -#operator_precedence_warning = off -#quote_all_identifiers = off -#sql_inheritance = on -#standard_conforming_strings = on -#synchronize_seqscans = on - -# - Other Platforms and Clients - - -#transform_null_equals = off - - -#------------------------------------------------------------------------------ -# ERROR HANDLING -#------------------------------------------------------------------------------ - -#exit_on_error = off # terminate session on any error? -#restart_after_crash = on # reinitialize after backend crash? - - -#------------------------------------------------------------------------------ -# CONFIG FILE INCLUDES -#------------------------------------------------------------------------------ - -# These options allow settings to be loaded from files other than the -# default postgresql.conf. 
- -#include_dir = 'conf.d' # include files ending in '.conf' from -# directory 'conf.d' -#include_if_exists = 'exists.conf' # include file only if it exists -#include = 'special.conf' # include file - - -#------------------------------------------------------------------------------ -# CUSTOMIZED OPTIONS -#------------------------------------------------------------------------------ - -# Add settings for extensions here diff --git a/tools/hive/hive_pg/scripts/bootstrap.sh b/tools/hive/hive_pg/scripts/bootstrap.sh deleted file mode 100644 index 15c6451f..00000000 --- a/tools/hive/hive_pg/scripts/bootstrap.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash -set -x -#save all env vars .bashrc for ssh sessions -printenv | cat >> /root/.bashrc - -# hadoop bootstrap -/etc/hadoop-bootstrap.sh -d - -# restart postgresql -sudo /etc/init.d/postgresql restart - -# kinit for kerberos mode -if command -v kinit 2>/dev/null; then - kinit -k -t /usr/local/hadoop/etc/hadoop/hdfs.keytab hdfs@LOCAL -fi - -until hdfs dfs -ls / -do - echo "waiting for hdfs to be ready"; sleep 10; -done - -# create hdfs directories -$HADOOP_PREFIX/bin/hdfs dfs -mkdir -p /user/root -hdfs dfs -chown -R hdfs:supergroup /user - -$HADOOP_PREFIX/bin/hdfs dfs -mkdir -p /apps/hive/warehouse -hdfs dfs -chown -R hive:supergroup /apps/hive -hdfs dfs -chmod 777 /apps/hive/warehouse - -# altering the hive-site configuration -sed s/HOSTNAME/$HOSTNAME/ /usr/local/hive/conf/hive-site.xml.template > /usr/local/hive/conf/hive-site.xml -sed s/HOSTNAME/$HOSTNAME/ /opt/files/hive-site.xml.template > /opt/files/hive-site.xml - -# start hive metastore server -$HIVE_HOME/bin/hive --service metastore & - -sleep 20 - -# start hive server -$HIVE_HOME/bin/hive --service hiveserver2 & - - -if [[ $1 == "-bash" ]]; then - /bin/bash -fi - -if [[ $1 == "-d" ]]; then - while true; do sleep 10000; done -fi diff --git a/tools/hive/hive_pg/templates/hive-site.xml.template b/tools/hive/hive_pg/templates/hive-site.xml.template deleted file mode 
100755 index d22861b4..00000000 --- a/tools/hive/hive_pg/templates/hive-site.xml.template +++ /dev/null @@ -1,154 +0,0 @@ - - - hive.metastore.cache.pinobjtypes - Table,Database,Type,FieldSchema,Order - - - javax.jdo.option.ConnectionDriverName - org.postgresql.Driver - - - javax.jdo.option.ConnectionUserName - hive - - - hive.auto.convert.join - true - - - fs.hdfs.impl.disable.cache - true - - - fs.file.impl.disable.cache - true - - - hive.metastore.warehouse.dir - /apps/hive/warehouse - - - hive.auto.convert.sortmerge.join - true - - - hive.metastore.client.socket.timeout - 60 - - - hive.optimize.bucketmapjoin - true - - - hive.optimize.bucketmapjoin.sortedmerge - true - - - hive.optimize.index.filter - true - - - hive.auto.convert.join.noconditionaltask.size - 1000000000 - - - hive.auto.convert.join.noconditionaltask - true - - - hive.mapjoin.bucket.cache.size - 10000 - - - hive.vectorized.execution.enabled - true - - - hive.security.authorization.enabled - false - - - hive.optimize.reducededuplication.min.reducer - 4 - - - hive.server2.enable.doAs - true - - - hive.mapred.reduce.tasks.speculative.execution - false - - - javax.jdo.option.ConnectionURL - jdbc:postgresql://localhost/metastore - - - hive.enforce.bucketing - true - - - hive.metastore.execute.setugi - true - - - hive.enforce.sorting - true - - - hive.security.authorization.manager - org.apache.hadoop.hive.ql.security.authorization.DefaultHiveAuthorizationProvider - - - hive.map.aggr - true - - - hive.optimize.reducededuplication - true - - - - hive.vectorized.execution.enabled - true - - - hive.vectorized.groupby.maxentries - 10000 - - - hive.vectorized.groupby.checkinterval - 10000 - - - hive.input.format - org.apache.hadoop.hive.ql.io.HiveInputFormat - - - javax.jdo.option.ConnectionPassword - hive - - - tez.am.node-blacklisting.enabled - false - - - hive.prewarm.numcontainers - 3 - - Controls the number of containers to prewarm for tez (hadoop 2 only) - - - - mapred.tez.java.opts - -Xmx256m - - - 
hive.tez.container.size - 256 - - - diff --git a/tools/hive/kerberos/Dockerfile b/tools/hive/kerberos/Dockerfile index ca3c0188..19559f65 100644 --- a/tools/hive/kerberos/Dockerfile +++ b/tools/hive/kerberos/Dockerfile @@ -6,7 +6,6 @@ ADD templates/hdfs-site.xml.template $HADOOP_PREFIX/etc/hadoop/hdfs-site.xml.tem ADD templates/yarn-site.xml.template $HADOOP_PREFIX/etc/hadoop/yarn-site.xml.template # copy kerberized hive config file -RUN echo $HIVE_CONF ADD templates/hive-site.xml.template /opt/files/ ADD templates/hive-site.xml.template $HIVE_CONF/hive-site.xml.template diff --git a/tools/hive/kerberos/templates/core-site.xml.template b/tools/hive/kerberos/templates/core-site.xml.template index a08c4d74..44933bbc 100644 --- a/tools/hive/kerberos/templates/core-site.xml.template +++ b/tools/hive/kerberos/templates/core-site.xml.template @@ -1,7 +1,7 @@ fs.defaultFS - hdfs://HOSTNAME:9000 + hdfs://{{HOSTNAME}}:9000 diff --git a/tools/hive/kerberos/templates/hdfs-site.xml.template b/tools/hive/kerberos/templates/hdfs-site.xml.template index 4fa0a560..03ec5444 100644 --- a/tools/hive/kerberos/templates/hdfs-site.xml.template +++ b/tools/hive/kerberos/templates/hdfs-site.xml.template @@ -17,7 +17,7 @@ dfs.namenode.kerberos.principal - hdfs/HOSTNAME@LOCAL + hdfs/{{HOSTNAME}}@LOCAL @@ -27,7 +27,7 @@ dfs.secondary.namenode.kerberos.principal - hdfs/HOSTNAME@LOCAL + hdfs/{{HOSTNAME}}@LOCAL @@ -37,13 +37,13 @@ dfs.datanode.kerberos.principal - hdfs/HOSTNAME@LOCAL + hdfs/{{HOSTNAME}}@LOCAL dfs.web.authentication.kerberos.principal - HTTP/HOSTNAME@LOCAL + HTTP/{{HOSTNAME}}@LOCAL diff --git a/tools/hive/kerberos/templates/hive-site.xml.template b/tools/hive/kerberos/templates/hive-site.xml.template index e2bdc064..b795077c 100755 --- a/tools/hive/kerberos/templates/hive-site.xml.template +++ b/tools/hive/kerberos/templates/hive-site.xml.template @@ -29,7 +29,7 @@ hive.server2.authentication.kerberos.principal - hive/HOSTNAME@LOCAL + hive/{{HOSTNAME}}@LOCAL @@ -49,7 +49,7 @@ 
hive.metastore.kerberos.principal - hive/HOSTNAME@LOCAL + hive/{{HOSTNAME}}@LOCAL diff --git a/tools/hive/kerberos/templates/yarn-site.xml.template b/tools/hive/kerberos/templates/yarn-site.xml.template index f110584f..fe5bd154 100644 --- a/tools/hive/kerberos/templates/yarn-site.xml.template +++ b/tools/hive/kerberos/templates/yarn-site.xml.template @@ -7,7 +7,7 @@ yarn.resourcemanager.principal - yarn/HOSTNAME@LOCAL + yarn/{{HOSTNAME}}@LOCAL @@ -18,6 +18,6 @@ yarn.nodemanager.principal - yarn/HOSTNAME@LOCAL + yarn/{{HOSTNAME}}@LOCAL diff --git a/tools/hive/single-image/conf/mapred-site.xml b/tools/hive/single-image/conf/mapred-site.xml deleted file mode 100644 index dba582f1..00000000 --- a/tools/hive/single-image/conf/mapred-site.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - mapreduce.framework.name - yarn - - diff --git a/tools/hive/single-image/scripts/hadoop-bootstrap.sh b/tools/hive/single-image/scripts/hadoop-bootstrap.sh deleted file mode 100755 index 6b28b83a..00000000 --- a/tools/hive/single-image/scripts/hadoop-bootstrap.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash -set -x - -# used by Hadoop -/usr/sbin/sshd - -: ${HADOOP_PREFIX:=/usr/local/hadoop} - -$HADOOP_PREFIX/etc/hadoop/hadoop-env.sh - -rm /tmp/*.pid - -# installing libraries if any - (resource urls added comma separated to the ACP system variable) -#cd $HADOOP_PREFIX/share/hadoop/common ; for cp in ${ACP//,/ }; do echo == $cp; curl -LO $cp ; done; cd - - -# templating of config files -sed s/HOSTNAME/$HOSTNAME/ /usr/local/hadoop/etc/hadoop/core-site.xml.template > /usr/local/hadoop/etc/hadoop/core-site.xml -sed s/HOSTNAME/$HOSTNAME/ /usr/local/hadoop/etc/hadoop/yarn-site.xml.template > /usr/local/hadoop/etc/hadoop/yarn-site.xml -sed s/HOSTNAME/$HOSTNAME/ /usr/local/hadoop/etc/hadoop/hdfs-site.xml.template > /usr/local/hadoop/etc/hadoop/hdfs-site.xml - -# format namenode -$HADOOP_PREFIX/bin/hdfs namenode -format - -# start hdfs and yarn -echo $JAVA_HOME -$HADOOP_PREFIX/sbin/start-dfs.sh 
-$HADOOP_PREFIX/sbin/start-yarn.sh - -if [[ $1 == "-bash" ]]; then - /bin/bash -fi diff --git a/tools/hive/single-image/templates/hdfs-site.xml.template b/tools/hive/single-image/templates/hdfs-site.xml.template deleted file mode 100644 index 82c525ea..00000000 --- a/tools/hive/single-image/templates/hdfs-site.xml.template +++ /dev/null @@ -1,6 +0,0 @@ - - - dfs.replication - 1 - - diff --git a/tools/hive/ubuntu/Dockerfile b/tools/hive/ubuntu/Dockerfile deleted file mode 100644 index 86ccc446..00000000 --- a/tools/hive/ubuntu/Dockerfile +++ /dev/null @@ -1,51 +0,0 @@ -FROM ubuntu:16.04 - -USER root - -ENV JAVA_HOME /usr/lib/jvm/java-8-oracle -ENV HIVE_HOME /usr/local/hive -ENV HADOOP_HOME /usr/local/hadoop - -ENV PATH $PATH:$JAVA_HOME/bin:$HIVE_HOME/bin:$HADOOP_HOME:$HADOOP_HOME/bin - -# install dev tools -RUN apt-get update -RUN apt-get install -y curl wget tar openssh-server openssh-client rsync python-software-properties apt-file apache2 - -# for running sshd in ubuntu trusty. https://github.com/docker/docker/issues/5704 -RUN mkdir /var/run/sshd -RUN echo 'root:secretpasswd' | chpasswd -RUN sed -i 's/PermitRootLogin without-password/PermitRootLogin yes/' /etc/ssh/sshd_config - -# passwordless ssh -RUN yes | ssh-keygen -q -N "" -t dsa -f /etc/ssh/ssh_host_dsa_key -RUN yes | ssh-keygen -q -N "" -t rsa -f /etc/ssh/ssh_host_rsa_key -RUN yes | ssh-keygen -q -N "" -t rsa -f /root/.ssh/id_rsa -RUN cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys - -# fix the 254 error code -RUN sed -i "/^[^#]*UsePAM/ s/.*/#&/" /etc/ssh/sshd_config -RUN echo "UsePAM no" >> /etc/ssh/sshd_config -RUN echo "Port 2122" >> /etc/ssh/sshd_config -RUN /usr/sbin/sshd - -# ssh client config -ADD conf/ssh_config /root/.ssh/config -RUN chmod 600 /root/.ssh/config -RUN chown root:root /root/.ssh/config - -# oracle jdk 8 -RUN apt-get install -y software-properties-common -RUN add-apt-repository ppa:webupd8team/java -RUN apt-get update - -# to accept license agreement automatically -RUN echo 
debconf shared/accepted-oracle-license-v1-1 select true | debconf-set-selections -RUN echo debconf shared/accepted-oracle-license-v1-1 seen true | debconf-set-selections -RUN apt-get install -y oracle-java8-installer - -# java env setup -ENV JAVA_HOME /usr/lib/jvm/java-8-oracle -ENV PATH $PATH:$JAVA_HOME/bin - -EXPOSE 22 diff --git a/tools/hive/ubuntu/base.env b/tools/hive/ubuntu/base.env deleted file mode 100644 index bf704a09..00000000 --- a/tools/hive/ubuntu/base.env +++ /dev/null @@ -1,19 +0,0 @@ -JAVA_HOME=/usr/lib/jvm/java-8-oracle -HIVE_HOME=/usr/local/hive -HIVE_CONF=/usr/local/hive/conf -HIVE_VERSION=1.1.0 -HADOOP_VERSION=2.6.0 -CDH_VERSION=5 -CDH_EXACT_VERSION=5.11.0 -POSTGRES_VERSION=9.5 -POSTGRESQL_MAIN=/var/lib/postgresql/9.5/main/ -POSTGRESQL_CONFIG_FILE=/var/lib/postgresql/9.5/main/postgresql.conf -POSTGRESQL_BIN=/usr/lib/postgresql/9.5/bin/postgres -PGPASSWORD=hive -HADOOP_HOME=/usr/local/hadoop -HADOOP_PREFIX=/usr/local/hadoop -HADOOP_COMMON_HOME=/usr/local/hadoop -HADOOP_HDFS_HOME=/usr/local/hadoop -HADOOP_MAPRED_HOME=/usr/local/hadoop -HADOOP_YARN_HOME=/usr/local/hadoop -HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop \ No newline at end of file diff --git a/tools/hive/ubuntu/conf/ssh_config b/tools/hive/ubuntu/conf/ssh_config deleted file mode 100644 index 535f9d32..00000000 --- a/tools/hive/ubuntu/conf/ssh_config +++ /dev/null @@ -1,5 +0,0 @@ -Host * - UserKnownHostsFile /dev/null - StrictHostKeyChecking no - LogLevel quiet - Port 2122 From 44572386b2e51b860c2ab6bbe67a390d8fcea8e5 Mon Sep 17 00:00:00 2001 From: "Susan X. 
Huynh" Date: Wed, 22 Aug 2018 12:33:05 -0700 Subject: [PATCH 11/15] updated the README --- tools/hive/README.md | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/tools/hive/README.md b/tools/hive/README.md index e98905f4..e96622f1 100644 --- a/tools/hive/README.md +++ b/tools/hive/README.md @@ -3,33 +3,20 @@ This is a Hadoop Docker image running CDH5 versions of Hadoop and Hive, all in one container. There is a separate Kerberos image in which Hadoop and Hive use Kerberos for authentication. Adapted from https://github.com/tilakpatidar/cdh5_hive_postgres and based on Ubuntu (trusty). +Postgres is also installed so that Hive can use it for its Metastore backend and run in remote mode. + ## Current Version * Hadoop 2.6.0 +* Hive 1.1.0 ## Dependencies The Kerberos image assumes that a KDC has been launched by the dcos-commons kdc.py script. ## Build the image -Download dependencies: -``` -./download_deps.sh -``` - -Build the Ubuntu base image: -``` -cd ubuntu -docker build -t cdh5-ubuntu . -``` - -Build the Hadoop image: -``` -cd ../hadoop-2.6.0 -docker build -t cdh5-hadoop . -``` Build the Hadoop + Hive image: ``` -cd ../hive_pg +cd hadoop-hive docker build -t cdh5-hive . ``` From c42d82b46883bbcaed16864ddb87c0060f585b46 Mon Sep 17 00:00:00 2001 From: "Susan X. 
Huynh" Date: Wed, 22 Aug 2018 14:47:00 -0700 Subject: [PATCH 12/15] shell script to autogenerate the Kerberos Hadoop config files --- tools/hive/README.md | 10 +++++++-- .../hive/kerberos/scripts/generate_configs.sh | 16 ++++++++++++++ ...mplate => core-site-kerberos.xml.template} | 5 ----- ...mplate => hdfs-site-kerberos.xml.template} | 5 ----- ...mplate => hive-site-kerberos.xml.template} | 22 ------------------- ...mplate => yarn-site-kerberos.xml.template} | 0 6 files changed, 24 insertions(+), 34 deletions(-) create mode 100755 tools/hive/kerberos/scripts/generate_configs.sh rename tools/hive/kerberos/templates/{core-site.xml.template => core-site-kerberos.xml.template} (82%) rename tools/hive/kerberos/templates/{hdfs-site.xml.template => hdfs-site-kerberos.xml.template} (94%) rename tools/hive/kerberos/templates/{hive-site.xml.template => hive-site-kerberos.xml.template} (68%) rename tools/hive/kerberos/templates/{yarn-site.xml.template => yarn-site-kerberos.xml.template} (100%) diff --git a/tools/hive/README.md b/tools/hive/README.md index e96622f1..97d816f1 100644 --- a/tools/hive/README.md +++ b/tools/hive/README.md @@ -14,15 +14,21 @@ The Kerberos image assumes that a KDC has been launched by the dcos-commons kdc. ## Build the image -Build the Hadoop + Hive image: +### Build the Hadoop + Hive image: ``` cd hadoop-hive docker build -t cdh5-hive . ``` -Build the Kerberized Hadoop + Hive image: +### Build the Kerberized Hadoop + Hive image: +First, autogenerate the Hadoop config files. ``` cd ../kerberos +scripts/generate_configs.sh +``` + +Then build the image: +``` docker build -t cdh5-hive-kerberos . 
``` diff --git a/tools/hive/kerberos/scripts/generate_configs.sh b/tools/hive/kerberos/scripts/generate_configs.sh new file mode 100755 index 00000000..c20629b0 --- /dev/null +++ b/tools/hive/kerberos/scripts/generate_configs.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +# Generate kerberos versions of Hadoop config files by taking the unkerberized versions +# and tacking on the associated kerberos properties. +# Assumption is that these two sets of properties do not overlap. + +cd "$( dirname "${BASH_SOURCE[0]}" )" +for FILE_BASE in core-site hdfs-site hive-site yarn-site; do + COMBINED_FILE="../templates/${FILE_BASE}.xml.template" + echo "Generating config file: kerberos/templates/${FILE_BASE}.xml.template" + echo '' > $COMBINED_FILE + grep -vh '' "../../hadoop-hive/templates/${FILE_BASE}.xml.template" >> $COMBINED_FILE + echo "" >> $COMBINED_FILE + grep -vh '' ../templates/${FILE_BASE}-kerberos.xml.template >> $COMBINED_FILE + echo '' >> $COMBINED_FILE +done diff --git a/tools/hive/kerberos/templates/core-site.xml.template b/tools/hive/kerberos/templates/core-site-kerberos.xml.template similarity index 82% rename from tools/hive/kerberos/templates/core-site.xml.template rename to tools/hive/kerberos/templates/core-site-kerberos.xml.template index 44933bbc..1f554fee 100644 --- a/tools/hive/kerberos/templates/core-site.xml.template +++ b/tools/hive/kerberos/templates/core-site-kerberos.xml.template @@ -1,9 +1,4 @@ - - fs.defaultFS - hdfs://{{HOSTNAME}}:9000 - - hadoop.security.authentication diff --git a/tools/hive/kerberos/templates/hdfs-site.xml.template b/tools/hive/kerberos/templates/hdfs-site-kerberos.xml.template similarity index 94% rename from tools/hive/kerberos/templates/hdfs-site.xml.template rename to tools/hive/kerberos/templates/hdfs-site-kerberos.xml.template index 03ec5444..06e19189 100644 --- a/tools/hive/kerberos/templates/hdfs-site.xml.template +++ b/tools/hive/kerberos/templates/hdfs-site-kerberos.xml.template @@ -1,9 +1,4 @@ - - dfs.replication - 1 - - 
dfs.block.access.token.enable diff --git a/tools/hive/kerberos/templates/hive-site.xml.template b/tools/hive/kerberos/templates/hive-site-kerberos.xml.template similarity index 68% rename from tools/hive/kerberos/templates/hive-site.xml.template rename to tools/hive/kerberos/templates/hive-site-kerberos.xml.template index b795077c..5dad8f1f 100755 --- a/tools/hive/kerberos/templates/hive-site.xml.template +++ b/tools/hive/kerberos/templates/hive-site-kerberos.xml.template @@ -1,26 +1,4 @@ - - - javax.jdo.option.ConnectionDriverName - org.postgresql.Driver - - - javax.jdo.option.ConnectionURL - jdbc:postgresql://localhost/metastore - - - javax.jdo.option.ConnectionUserName - hive - - - javax.jdo.option.ConnectionPassword - hive - - - hive.metastore.warehouse.dir - /apps/hive/warehouse - - hive.server2.authentication diff --git a/tools/hive/kerberos/templates/yarn-site.xml.template b/tools/hive/kerberos/templates/yarn-site-kerberos.xml.template similarity index 100% rename from tools/hive/kerberos/templates/yarn-site.xml.template rename to tools/hive/kerberos/templates/yarn-site-kerberos.xml.template From 7552c5c5d70ff82643b8e0c4268ffe1bd1659aed Mon Sep 17 00:00:00 2001 From: Sam Tran Date: Tue, 18 Sep 2018 15:49:52 -0700 Subject: [PATCH 13/15] Address Evan's comments --- tools/hive/hadoop-hive/Dockerfile | 54 ++++++++----------- .../hadoop-hive/scripts/hive-bootstrap.sh | 9 ++-- 2 files changed, 28 insertions(+), 35 deletions(-) diff --git a/tools/hive/hadoop-hive/Dockerfile b/tools/hive/hadoop-hive/Dockerfile index c7a97d53..12f89bd7 100644 --- a/tools/hive/hadoop-hive/Dockerfile +++ b/tools/hive/hadoop-hive/Dockerfile @@ -2,22 +2,6 @@ FROM ubuntu:16.04 USER root -ENV JAVA_HOME /usr/lib/jvm/java-8-oracle -ENV HADOOP_VERSION 2.6.0 -ENV CDH_VERSION 5 -ENV CDH_EXACT_VERSION 5.11.0 -ENV HADOOP_HOME /usr/local/hadoop -ENV HADOOP_PREFIX /usr/local/hadoop -ENV HADOOP_CONF_DIR /usr/local/hadoop/etc/hadoop -ENV HIVE_HOME /usr/local/hive -ENV HIVE_CONF /usr/local/hive/conf 
-ENV HIVE_VERSION 1.1.0 -ENV POSTGRES_VERSION 9.5 -ENV POSTGRESQL_MAIN /var/lib/postgresql/9.5/main/ -ENV POSTGRESQL_CONFIG_FILE /var/lib/postgresql/9.5/main/postgresql.conf -ENV POSTGRESQL_BIN /usr/lib/postgresql/9.5/bin/postgres -ENV PGPASSWORD hive - # install dev tools RUN apt-get update && \ apt-get install -y curl wget tar openssh-server openssh-client rsync python-software-properties apt-file apache2 && \ @@ -62,6 +46,10 @@ RUN apt-get update && \ ENV JAVA_HOME /usr/lib/jvm/java-8-oracle ENV PATH $PATH:$JAVA_HOME/bin +ENV CDH_VERSION 5 +ENV CDH_EXACT_VERSION 5.11.0 +ENV HADOOP_VERSION 2.6.0 + # download cdh hadoop RUN curl -L http://archive.cloudera.com/cdh${CDH_VERSION}/cdh/${CDH_VERSION}/hadoop-${HADOOP_VERSION}-cdh${CDH_EXACT_VERSION}.tar.gz \ | tar -xzC /usr/local && \ @@ -69,6 +57,7 @@ RUN curl -L http://archive.cloudera.com/cdh${CDH_VERSION}/cdh/${CDH_VERSION}/had ln -s ./hadoop-${HADOOP_VERSION}-cdh${CDH_EXACT_VERSION} hadoop # need to define JAVA_HOME inside hadoop-env.sh +ENV HADOOP_PREFIX /usr/local/hadoop RUN sed -i '/^export JAVA_HOME/ s:.*:export JAVA_HOME=/usr/lib/jvm/java-8-oracle\n:' $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh # pseudo distributed configurations of hadoop @@ -85,6 +74,7 @@ RUN chmod 700 /etc/hadoop-bootstrap.sh RUN chmod +x /usr/local/hadoop/etc/hadoop/*-env.sh # add hadoop to path +ENV HADOOP_HOME /usr/local/hadoop ENV PATH $PATH:$HADOOP_HOME:$HADOOP_HOME/bin #for exposed ports refer @@ -92,16 +82,11 @@ ENV PATH $PATH:$HADOOP_HOME:$HADOOP_HOME/bin EXPOSE 50010 50020 50070 50075 50090 8020 9000 10020 19888 8030 8031 8032 8033 8040 8042 8088 # download cdh hive -RUN curl -L http://archive.cloudera.com/cdh${CDH_VERSION}/cdh/${CDH_VERSION}/hive-1.1.0-cdh${CDH_EXACT_VERSION}.tar.gz \ +ENV HIVE_VERSION 1.1.0 +RUN curl -L http://archive.cloudera.com/cdh${CDH_VERSION}/cdh/${CDH_VERSION}/hive-${HIVE_VERSION}-cdh${CDH_EXACT_VERSION}.tar.gz \ | tar -xzC /usr/local && \ cd /usr/local && \ - mv hive-1.1.0-cdh${CDH_EXACT_VERSION} hive - -# 
add hive to path -ENV PATH $PATH:$HIVE_HOME/bin - -# add postgresql jdbc jar to classpath -RUN ln -s /usr/share/java/postgresql-jdbc4.jar $HIVE_HOME/lib/postgresql-jdbc4.jar + mv hive-${HIVE_VERSION}-cdh${CDH_EXACT_VERSION} hive # to configure postgres as hive metastore backend RUN sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt/ `lsb_release -cs`-pgdg main" >> /etc/apt/sources.list.d/pgdg.list' @@ -110,31 +95,38 @@ RUN apt-get update -y && \ apt-get -yq install vim postgresql-9.5 libpostgresql-jdbc-java && \ rm -rf /var/lib/apt/lists/* +# add hive to path +ENV HIVE_HOME /usr/local/hive +ENV PATH $PATH:$HIVE_HOME/bin + +# add postgresql jdbc jar to classpath +RUN ln -s /usr/share/java/postgresql-jdbc4.jar $HIVE_HOME/lib/postgresql-jdbc4.jar + USER postgres # initialize hive metastore db # create metastore db, hive user and assign privileges -RUN cd $HIVE_HOME/scripts/metastore/upgrade/postgres/ &&\ - /etc/init.d/postgresql start &&\ - psql --command "CREATE DATABASE metastore;" &&\ +RUN cd $HIVE_HOME/scripts/metastore/upgrade/postgres/ && \ + /etc/init.d/postgresql start && \ + psql --command "CREATE DATABASE metastore;" && \ psql --command "CREATE USER hive WITH PASSWORD 'hive';" && \ psql --command "ALTER USER hive WITH SUPERUSER;" && \ psql --command "GRANT ALL PRIVILEGES ON DATABASE metastore TO hive;" && \ psql -U hive -d metastore -h localhost -f hive-schema-${HIVE_VERSION}.postgres.sql - # revert back to default user USER root # disable ssl in postgres.conf +ENV POSTGRES_VERSION 9.5 +ENV POSTGRESQL_CONFIG_FILE /var/lib/postgresql/${POSTGRES_VERSION}/main/postgresql.conf +ENV POSTGRESQL_MAIN /var/lib/postgresql/9.5/main/ ADD conf/postgresql.conf $POSTGRESQL_MAIN -RUN echo $POSTGRESQL_MAIN -RUN echo $POSTGRESQL_CONFIG_FILE RUN chown postgres:postgres $POSTGRESQL_CONFIG_FILE RUN sed -i -e 's/peer/md5/g' /etc/postgresql/$POSTGRES_VERSION/main/pg_hba.conf # copy config, sql, data files to /opt/files +ENV HIVE_CONF /usr/local/hive/conf RUN mkdir 
/opt/files -RUN echo $HIVE_CONF ADD templates/hive-site.xml.template /opt/files/ ADD templates/hive-site.xml.template $HIVE_CONF/hive-site.xml.template diff --git a/tools/hive/hadoop-hive/scripts/hive-bootstrap.sh b/tools/hive/hadoop-hive/scripts/hive-bootstrap.sh index 4d40308f..5f4f2d58 100644 --- a/tools/hive/hadoop-hive/scripts/hive-bootstrap.sh +++ b/tools/hive/hadoop-hive/scripts/hive-bootstrap.sh @@ -4,7 +4,7 @@ set -x printenv | cat >> /root/.bashrc # hadoop bootstrap -/etc/hadoop-bootstrap.sh -d +/etc/hadoop-bootstrap.sh # restart postgresql /etc/init.d/postgresql restart @@ -42,8 +42,9 @@ $HIVE_HOME/bin/hive --service hiveserver2 & if [[ $1 == "-bash" ]]; then /bin/bash -fi - -if [[ $1 == "-d" ]]; then +elif [[ $1 == "-d" ]]; then while true; do sleep 10000; done +else + echo "Unknown argument $1" + echo "Usage: ./hive-bootstrap.sh [ -bash | -d ]" fi From fed868e9193395b61845c3d4196d1f5ff454846e Mon Sep 17 00:00:00 2001 From: Sam Tran Date: Wed, 19 Sep 2018 09:33:56 -0700 Subject: [PATCH 14/15] Cleanup postgresql.conf --- tools/hive/hadoop-hive/conf/postgresql.conf | 610 +------------------- 1 file changed, 12 insertions(+), 598 deletions(-) diff --git a/tools/hive/hadoop-hive/conf/postgresql.conf b/tools/hive/hadoop-hive/conf/postgresql.conf index 12b8314a..ba4c9bc3 100644 --- a/tools/hive/hadoop-hive/conf/postgresql.conf +++ b/tools/hive/hadoop-hive/conf/postgresql.conf @@ -2,629 +2,43 @@ # PostgreSQL configuration file # ----------------------------- # -# This file consists of lines of the form: -# -# name = value -# -# (The "=" is optional.) Whitespace may be used. Comments are introduced with -# "#" anywhere on a line. The complete list of parameter names and allowed -# values can be found in the PostgreSQL documentation. -# -# The commented-out settings shown in this file represent the default values. -# Re-commenting a setting is NOT sufficient to revert it to the default value; -# you need to reload the server. 
-# -# This file is read on server startup and when the server receives a SIGHUP -# signal. If you edit the file on a running system, you have to SIGHUP the -# server for the changes to take effect, or use "pg_ctl reload". Some -# parameters, which are marked below, require a server shutdown and restart to -# take effect. -# -# Any parameter can also be given as a command-line option to the server, e.g., -# "postgres -c log_connections=on". Some parameters can be changed at run time -# with the "SET" SQL command. -# -# Memory units: kB = kilobytes Time units: ms = milliseconds -# MB = megabytes s = seconds -# GB = gigabytes min = minutes -# TB = terabytes h = hours -# d = days - #------------------------------------------------------------------------------ # FILE LOCATIONS #------------------------------------------------------------------------------ -# The default values of these variables are driven from the -D command-line -# option or PGDATA environment variable, represented here as ConfigDir. - -data_directory = '/var/lib/postgresql/9.5/main' # use data in another directory -# (change requires restart) -hba_file = '/var/lib/postgresql/9.5/pg_hba.conf' # host-based authentication file -# (change requires restart) +data_directory = '/var/lib/postgresql/9.5/main' # use data in another directory +hba_file = '/var/lib/postgresql/9.5/pg_hba.conf' # host-based authentication file ident_file = '/var/lib/postgresql/9.5/pg_ident.conf' # ident configuration file -# (change requires restart) - -# If external_pid_file is not explicitly set, no extra PID file is written. 
-#external_pid_file = '' # write an extra PID file -# (change requires restart) - #------------------------------------------------------------------------------ # CONNECTIONS AND AUTHENTICATION #------------------------------------------------------------------------------ # - Connection Settings - - listen_addresses = '*' # what IP address(es) to listen on; -# comma-separated list of addresses; -# defaults to 'localhost'; use '*' for all -# (change requires restart) -port = 5432 # (change requires restart) -max_connections = 100 # (change requires restart) -# Note: Increasing max_connections costs ~400 bytes of shared memory per -# connection slot, plus lock space (see max_locks_per_transaction). -#superuser_reserved_connections = 3 # (change requires restart) -#unix_socket_directories = '/tmp' # comma-separated list of directories -# (change requires restart) -#unix_socket_group = '' # (change requires restart) -#unix_socket_permissions = 0777 # begin with 0 to use octal notation -# (change requires restart) -#bonjour = off # advertise server via Bonjour -# (change requires restart) -#bonjour_name = '' # defaults to the computer name -# (change requires restart) +port = 5432 +max_connections = 100 # - Security and Authentication - - -#authentication_timeout = 1min # 1s-600s -ssl = off # (change requires restart) -#ssl_ciphers = 'HIGH:MEDIUM:+3DES:!aNULL' # allowed SSL ciphers -# (change requires restart) -#ssl_prefer_server_ciphers = on # (change requires restart) -#ssl_ecdh_curve = 'prime256v1' # (change requires restart) -#ssl_cert_file = 'server.crt' # (change requires restart) -#ssl_key_file = 'server.key' # (change requires restart) -#ssl_ca_file = '' # (change requires restart) -#ssl_crl_file = '' # (change requires restart) -#password_encryption = on -#db_user_namespace = off -#row_security = on - -# GSSAPI using Kerberos -#krb_server_keyfile = '' -#krb_caseins_users = off - -# - TCP Keepalives - -# see "man 7 tcp" for details - -#tcp_keepalives_idle = 0 
# TCP_KEEPIDLE, in seconds; -# 0 selects the system default -#tcp_keepalives_interval = 0 # TCP_KEEPINTVL, in seconds; -# 0 selects the system default -#tcp_keepalives_count = 0 # TCP_KEEPCNT; -# 0 selects the system default - +ssl = off #------------------------------------------------------------------------------ # RESOURCE USAGE (except WAL) #------------------------------------------------------------------------------ # - Memory - - shared_buffers = 128MB # min 128kB -# (change requires restart) -#huge_pages = try # on, off, or try -# (change requires restart) -#temp_buffers = 8MB # min 800kB -#max_prepared_transactions = 0 # zero disables the feature -# (change requires restart) -# Note: Increasing max_prepared_transactions costs ~600 bytes of shared memory -# per transaction slot, plus lock space (see max_locks_per_transaction). -# It is not advisable to set max_prepared_transactions nonzero unless you -# actively intend to use prepared transactions. -#work_mem = 4MB # min 64kB -#maintenance_work_mem = 64MB # min 1MB -#autovacuum_work_mem = -1 # min 1MB, or -1 to use maintenance_work_mem -#max_stack_depth = 2MB # min 100kB -#dynamic_shared_memory_type = posix # the default is the first option -# supported by the operating system: -# posix -# sysv -# windows -# mmap -# use none to disable dynamic shared memory - -# - Disk - - -#temp_file_limit = -1 # limits per-session temp file space -# in kB, or -1 for no limit - -# - Kernel Resource Usage - - -#max_files_per_process = 1000 # min 25 -# (change requires restart) -#shared_preload_libraries = '' # (change requires restart) - -# - Cost-Based Vacuum Delay - - -#vacuum_cost_delay = 0 # 0-100 milliseconds -#vacuum_cost_page_hit = 1 # 0-10000 credits -#vacuum_cost_page_miss = 10 # 0-10000 credits -#vacuum_cost_page_dirty = 20 # 0-10000 credits -#vacuum_cost_limit = 200 # 1-10000 credits - -# - Background Writer - - -#bgwriter_delay = 200ms # 10-10000ms between rounds -#bgwriter_lru_maxpages = 100 # 0-1000 max 
buffers written/round -#bgwriter_lru_multiplier = 2.0 # 0-10.0 multipler on buffers scanned/round - -# - Asynchronous Behavior - - -#effective_io_concurrency = 1 # 1-1000; 0 disables prefetching -#max_worker_processes = 8 - - -#------------------------------------------------------------------------------ -# WRITE AHEAD LOG -#------------------------------------------------------------------------------ - -# - Settings - - -#wal_level = minimal # minimal, archive, hot_standby, or logical -# (change requires restart) -#fsync = on # turns forced synchronization on or off -#synchronous_commit = on # synchronization level; -# off, local, remote_write, or on -#wal_sync_method = fsync # the default is the first option -# supported by the operating system: -# open_datasync -# fdatasync (default on Linux) -# fsync -# fsync_writethrough -# open_sync -#full_page_writes = on # recover from partial page writes -#wal_compression = off # enable compression of full-page writes -#wal_log_hints = off # also do full page writes of non-critical updates -# (change requires restart) -#wal_buffers = -1 # min 32kB, -1 sets based on shared_buffers -# (change requires restart) -#wal_writer_delay = 200ms # 1-10000 milliseconds - -#commit_delay = 0 # range 0-100000, in microseconds -#commit_siblings = 5 # range 1-1000 - -# - Checkpoints - - -#checkpoint_timeout = 5min # range 30s-1h -#max_wal_size = 1GB -#min_wal_size = 80MB -#checkpoint_completion_target = 0.5 # checkpoint target duration, 0.0 - 1.0 -#checkpoint_warning = 30s # 0 disables - -# - Archiving - - -#archive_mode = off # enables archiving; off, on, or always -# (change requires restart) -#archive_command = '' # command to use to archive a logfile segment -# placeholders: %p = path of file to archive -# %f = file name only -# e.g. 'test ! 
-f /mnt/server/archivedir/%f && cp %p /mnt/server/archivedir/%f' -#archive_timeout = 0 # force a logfile segment switch after this -# number of seconds; 0 disables - - -#------------------------------------------------------------------------------ -# REPLICATION -#------------------------------------------------------------------------------ - -# - Sending Server(s) - - -# Set these on the master and on any standby that will send replication data. - -#max_wal_senders = 0 # max number of walsender processes -# (change requires restart) -#wal_keep_segments = 0 # in logfile segments, 16MB each; 0 disables -#wal_sender_timeout = 60s # in milliseconds; 0 disables - -#max_replication_slots = 0 # max number of replication slots -# (change requires restart) -#track_commit_timestamp = off # collect timestamp of transaction commit -# (change requires restart) - -# - Master Server - - -# These settings are ignored on a standby server. - -#synchronous_standby_names = '' # standby servers that provide sync rep -# comma-separated list of application_name -# from standby(s); '*' = all -#vacuum_defer_cleanup_age = 0 # number of xacts by which cleanup is delayed - -# - Standby Servers - - -# These settings are ignored on a master server. 
- -#hot_standby = off # "on" allows queries during recovery -# (change requires restart) -#max_standby_archive_delay = 30s # max delay before canceling queries -# when reading WAL from archive; -# -1 allows indefinite delay -#max_standby_streaming_delay = 30s # max delay before canceling queries -# when reading streaming WAL; -# -1 allows indefinite delay -#wal_receiver_status_interval = 10s # send replies at least this often -# 0 disables -#hot_standby_feedback = off # send info from standby to prevent -# query conflicts -#wal_receiver_timeout = 60s # time that receiver waits for -# communication from master -# in milliseconds; 0 disables -#wal_retrieve_retry_interval = 5s # time to wait before retrying to -# retrieve WAL after a failed attempt - - -#------------------------------------------------------------------------------ -# QUERY TUNING -#------------------------------------------------------------------------------ - -# - Planner Method Configuration - - -#enable_bitmapscan = on -#enable_hashagg = on -#enable_hashjoin = on -#enable_indexscan = on -#enable_indexonlyscan = on -#enable_material = on -#enable_mergejoin = on -#enable_nestloop = on -#enable_seqscan = on -#enable_sort = on -#enable_tidscan = on - -# - Planner Cost Constants - - -#seq_page_cost = 1.0 # measured on an arbitrary scale -#random_page_cost = 4.0 # same scale as above -#cpu_tuple_cost = 0.01 # same scale as above -#cpu_index_tuple_cost = 0.005 # same scale as above -#cpu_operator_cost = 0.0025 # same scale as above -#effective_cache_size = 4GB - -# - Genetic Query Optimizer - - -#geqo = on -#geqo_threshold = 12 -#geqo_effort = 5 # range 1-10 -#geqo_pool_size = 0 # selects default based on effort -#geqo_generations = 0 # selects default based on effort -#geqo_selection_bias = 2.0 # range 1.5-2.0 -#geqo_seed = 0.0 # range 0.0-1.0 - -# - Other Planner Options - - -#default_statistics_target = 100 # range 1-10000 -#constraint_exclusion = partition # on, off, or partition 
-#cursor_tuple_fraction = 0.1 # range 0.0-1.0 -#from_collapse_limit = 8 -#join_collapse_limit = 8 # 1 disables collapsing of explicit -# JOIN clauses - #------------------------------------------------------------------------------ # ERROR REPORTING AND LOGGING #------------------------------------------------------------------------------ # - Where to Log - - -log_destination = 'stderr' # Valid values are combinations of -# stderr, csvlog, syslog, and eventlog, -# depending on platform. csvlog -# requires logging_collector to be on. - -# This is used when logging to stderr: -logging_collector = on # Enable capturing of stderr and csvlog -# into log files. Required to be on for -# csvlogs. -# (change requires restart) - -# These are only used if logging_collector is on: -log_directory = 'pg_log' # directory where log files are written, -# can be absolute or relative to PGDATA -log_filename = 'postgresql-%a.log' # log file name pattern, -# can include strftime() escapes -#log_file_mode = 0600 # creation mode for log files, -# begin with 0 to use octal notation -log_truncate_on_rotation = on # If on, an existing log file with the -# same name as the new log file will be -# truncated rather than appended to. -# But such truncation only occurs on -# time-driven rotation, not on restarts -# or size-driven rotation. Default is -# off, meaning append to existing files -# in all cases. -log_rotation_age = 1d # Automatic rotation of logfiles will -# happen after that time. 0 disables. -log_rotation_size = 0 # Automatic rotation of logfiles will -# happen after that much log output. -# 0 disables. 
- -# These are relevant when logging to syslog: -#syslog_facility = 'LOCAL0' -#syslog_ident = 'postgres' - -# This is only relevant when logging to eventlog (win32): -#event_source = 'PostgreSQL' - -# - When to Log - - -#client_min_messages = notice # values in order of decreasing detail: -# debug5 -# debug4 -# debug3 -# debug2 -# debug1 -# log -# notice -# warning -# error - -#log_min_messages = warning # values in order of decreasing detail: -# debug5 -# debug4 -# debug3 -# debug2 -# debug1 -# info -# notice -# warning -# error -# log -# fatal -# panic - -#log_min_error_statement = error # values in order of decreasing detail: -# debug5 -# debug4 -# debug3 -# debug2 -# debug1 -# info -# notice -# warning -# error -# log -# fatal -# panic (effectively off) - -#log_min_duration_statement = -1 # -1 is disabled, 0 logs all statements -# and their durations, > 0 logs only -# statements running at least this number -# of milliseconds - - -# - What to Log - - -#debug_print_parse = off -#debug_print_rewritten = off -#debug_print_plan = off -#debug_pretty_print = on -#log_checkpoints = off -#log_connections = off -#log_disconnections = off -#log_duration = off -#log_error_verbosity = default # terse, default, or verbose messages -#log_hostname = off -#log_line_prefix = '< %m >' # special values: -# %a = application name -# %u = user name -# %d = database name -# %r = remote host and port -# %h = remote host -# %p = process ID -# %t = timestamp without milliseconds -# %m = timestamp with milliseconds -# %i = command tag -# %e = SQL state -# %c = session ID -# %l = session line number -# %s = session start timestamp -# %v = virtual transaction ID -# %x = transaction ID (0 if none) -# %q = stop here in non-session -# processes -# %% = '%' -# e.g. 
'<%u%%%d> ' -#log_lock_waits = off # log lock waits >= deadlock_timeout -#log_statement = 'none' # none, ddl, mod, all -#log_replication_commands = off -#log_temp_files = -1 # log temporary files equal or larger -# than the specified size in kilobytes; -# -1 disables, 0 logs all temp files -#log_timezone = 'GMT' - - -# - Process Title - - -#cluster_name = '' # added to process titles if nonempty -# (change requires restart) -#update_process_title = on - - -#------------------------------------------------------------------------------ -# RUNTIME STATISTICS -#------------------------------------------------------------------------------ - -# - Query/Index Statistics Collector - - -#track_activities = on -#track_counts = on -#track_io_timing = off -#track_functions = none # none, pl, all -#track_activity_query_size = 1024 # (change requires restart) -#stats_temp_directory = 'pg_stat_tmp' - - -# - Statistics Monitoring - - -#log_parser_stats = off -#log_planner_stats = off -#log_executor_stats = off -#log_statement_stats = off - - -#------------------------------------------------------------------------------ -# AUTOVACUUM PARAMETERS -#------------------------------------------------------------------------------ - -#autovacuum = on # Enable autovacuum subprocess? 'on' -# requires track_counts to also be on. -#log_autovacuum_min_duration = -1 # -1 disables, 0 logs all actions and -# their durations, > 0 logs only -# actions running at least this number -# of milliseconds. 
-#autovacuum_max_workers = 3 # max number of autovacuum subprocesses -# (change requires restart) -#autovacuum_naptime = 1min # time between autovacuum runs -#autovacuum_vacuum_threshold = 50 # min number of row updates before -# vacuum -#autovacuum_analyze_threshold = 50 # min number of row updates before -# analyze -#autovacuum_vacuum_scale_factor = 0.2 # fraction of table size before vacuum -#autovacuum_analyze_scale_factor = 0.1 # fraction of table size before analyze -#autovacuum_freeze_max_age = 200000000 # maximum XID age before forced vacuum -# (change requires restart) -#autovacuum_multixact_freeze_max_age = 400000000 # maximum multixact age -# before forced vacuum -# (change requires restart) -#autovacuum_vacuum_cost_delay = 20ms # default vacuum cost delay for -# autovacuum, in milliseconds; -# -1 means use vacuum_cost_delay -#autovacuum_vacuum_cost_limit = -1 # default vacuum cost limit for -# autovacuum, -1 means use -# vacuum_cost_limit - - -#------------------------------------------------------------------------------ -# CLIENT CONNECTION DEFAULTS -#------------------------------------------------------------------------------ - -# - Statement Behavior - - -#search_path = '"$user", public' # schema names -#default_tablespace = '' # a tablespace name, '' uses the default -#temp_tablespaces = '' # a list of tablespace names, '' uses -# only default tablespace -#check_function_bodies = on -#default_transaction_isolation = 'read committed' -#default_transaction_read_only = off -#default_transaction_deferrable = off -#session_replication_role = 'origin' -#statement_timeout = 0 # in milliseconds, 0 is disabled -#lock_timeout = 0 # in milliseconds, 0 is disabled -#vacuum_freeze_min_age = 50000000 -#vacuum_freeze_table_age = 150000000 -#vacuum_multixact_freeze_min_age = 5000000 -#vacuum_multixact_freeze_table_age = 150000000 -#bytea_output = 'hex' # hex, escape -#xmlbinary = 'base64' -#xmloption = 'content' -#gin_fuzzy_search_limit = 0 
-#gin_pending_list_limit = 4MB - -# - Locale and Formatting - - -#datestyle = 'iso, mdy' -#intervalstyle = 'postgres' -#timezone = 'GMT' -#timezone_abbreviations = 'Default' # Select the set of available time zone -# abbreviations. Currently, there are -# Default -# Australia (historical usage) -# India -# You can create your own file in -# share/timezonesets/. -#extra_float_digits = 0 # min -15, max 3 -#client_encoding = sql_ascii # actually, defaults to database -# encoding - -# These settings are initialized by initdb, but they can be changed. -#lc_messages = 'C' # locale for system error message -# strings -#lc_monetary = 'C' # locale for monetary formatting -#lc_numeric = 'C' # locale for number formatting -#lc_time = 'C' # locale for time formatting - -# default configuration for text search -#default_text_search_config = 'pg_catalog.simple' - -# - Other Defaults - - -#dynamic_library_path = '$libdir' -#local_preload_libraries = '' -#session_preload_libraries = '' - - -#------------------------------------------------------------------------------ -# LOCK MANAGEMENT -#------------------------------------------------------------------------------ - -#deadlock_timeout = 1s -#max_locks_per_transaction = 64 # min 10 -# (change requires restart) -# Note: Each lock table slot uses ~270 bytes of shared memory, and there are -# max_locks_per_transaction * (max_connections + max_prepared_transactions) -# lock table slots. 
-#max_pred_locks_per_transaction = 64 # min 10 -# (change requires restart) - - -#------------------------------------------------------------------------------ -# VERSION/PLATFORM COMPATIBILITY -#------------------------------------------------------------------------------ - -# - Previous PostgreSQL Versions - - -#array_nulls = on -#backslash_quote = safe_encoding # on, off, or safe_encoding -#default_with_oids = off -#escape_string_warning = on -#lo_compat_privileges = off -#operator_precedence_warning = off -#quote_all_identifiers = off -#sql_inheritance = on -#standard_conforming_strings = on -#synchronize_seqscans = on - -# - Other Platforms and Clients - - -#transform_null_equals = off - - -#------------------------------------------------------------------------------ -# ERROR HANDLING -#------------------------------------------------------------------------------ - -#exit_on_error = off # terminate session on any error? -#restart_after_crash = on # reinitialize after backend crash? - - -#------------------------------------------------------------------------------ -# CONFIG FILE INCLUDES -#------------------------------------------------------------------------------ - -# These options allow settings to be loaded from files other than the -# default postgresql.conf. 
- -#include_dir = 'conf.d' # include files ending in '.conf' from -# directory 'conf.d' -#include_if_exists = 'exists.conf' # include file only if it exists -#include = 'special.conf' # include file - - -#------------------------------------------------------------------------------ -# CUSTOMIZED OPTIONS -#------------------------------------------------------------------------------ - -# Add settings for extensions here +log_destination = 'stderr' # Valid values are combinations of stderr, csvlog, syslog, and eventlog +logging_collector = on # Enable capturing of stderr and csvlog into log files +log_directory = 'pg_log' # directory where log files are written; can be absolute or relative to PGDATA +log_filename = 'postgresql-%a.log' # log file name pattern; can include strftime() escapes +log_truncate_on_rotation = on +log_rotation_age = 1d +log_rotation_size = 0 From 27a0a4dd5fb9c63a19b568dbbdabfa2084947481 Mon Sep 17 00:00:00 2001 From: Sam Tran Date: Wed, 19 Sep 2018 09:58:21 -0700 Subject: [PATCH 15/15] Add back pg pw envvar --- tools/hive/hadoop-hive/Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/hive/hadoop-hive/Dockerfile b/tools/hive/hadoop-hive/Dockerfile index 12f89bd7..e19c4903 100644 --- a/tools/hive/hadoop-hive/Dockerfile +++ b/tools/hive/hadoop-hive/Dockerfile @@ -102,7 +102,9 @@ ENV PATH $PATH:$HIVE_HOME/bin # add postgresql jdbc jar to classpath RUN ln -s /usr/share/java/postgresql-jdbc4.jar $HIVE_HOME/lib/postgresql-jdbc4.jar +ENV PGPASSWORD hive USER postgres + # initialize hive metastore db # create metastore db, hive user and assign privileges RUN cd $HIVE_HOME/scripts/metastore/upgrade/postgres/ && \