diff --git a/hadoop/README.md b/hadoop/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..70a4873795d8b26aa8890eaa4a9e8f7845461d5c
--- /dev/null
+++ b/hadoop/README.md
@@ -0,0 +1,244 @@
+# Hadoop Docker
+
+This repository contains *Dockerfile*s for setting up a basic Hadoop cluster.
+The available components are:
+
+1. *HDFS*:
+
+   * *namenode*
+   * *datanode*
+
+1. *YARN*:
+
+   * *resourcemanager*
+   * *nodemanager*
+
+1. *Spark submitter*
+
+All images inherit from a base *hadoop* image which provides a Hadoop
+installation in `/opt/` and a way to configure *hadoop* via environment
+variables.
+
+## Hadoop configuration
+
+The *hadoop* configuration is controlled via the following environment
+variable groups:
+
+1. `CORE_CONF`: affects `/etc/hadoop/core-site.xml`
+1. `HDFS_CONF`: affects `/etc/hadoop/hdfs-site.xml`
+1. `YARN_CONF`: affects `/etc/hadoop/yarn-site.xml`
+1. `HTTPFS_CONF`: affects `/etc/hadoop/httpfs-site.xml`
+1. `KMS_CONF`: affects `/etc/hadoop/kms-site.xml`
+
+*Hadoop* properties are set by defining an environment variable with the
+appropriate prefix, in the form `<PREFIX>_<property_name>`.
+
+Due to restrictions imposed by `docker` and `docker-compose` on
+environment variable names, the following substitutions are applied to
+property names:
+
+* `_` => `.`
+* `__` => `_`
+* `___` => `-`
+
+Some illustrative examples:
+
+* `CORE_CONF_fs_defaultFS`: sets the *fs.defaultFS* property in
+  `core-site.xml`
+* `YARN_CONF_yarn_log___aggregation___enable`: sets the
+  *yarn.log-aggregation-enable* property in `yarn-site.xml`
+
+## Hadoop configuration presets
+
+Furthermore, the following special environment variables control
+configuration presets:
+
+* `MULTIHOMED_NETWORK`: when set to `1` (the default in the base image),
+  configures the *hadoop* cluster to be reachable from multiple
+  networks. Specifically, the following properties are set:
+
+  In `/etc/hadoop/hdfs-site.xml`:
+
+  * dfs.namenode.rpc-bind-host = 0.0.0.0
+  * dfs.namenode.servicerpc-bind-host = 0.0.0.0
+  * dfs.namenode.http-bind-host = 0.0.0.0
+  * dfs.namenode.https-bind-host = 0.0.0.0
+  * dfs.client.use.datanode.hostname = true
+  * dfs.datanode.use.datanode.hostname = true
+
+  In `/etc/hadoop/yarn-site.xml`:
+
+  * yarn.resourcemanager.bind-host = 0.0.0.0
+  * yarn.nodemanager.bind-host = 0.0.0.0
+  * yarn.timeline-service.bind-host = 0.0.0.0
+
+  In `/etc/hadoop/mapred-site.xml`:
+
+  * yarn.nodemanager.bind-host = 0.0.0.0
+
+* `GANGLIA_HOST`: instructs *hadoop* to send metrics to the specified
+  *ganglia gmond* daemon (requires a unicast ganglia configuration)
+
+## Networking
+
+In order for things to run smoothly it's recommended to exploit the
+networking infrastructure introduced in *docker* 1.9: create a dedicated
+*network* for the cluster to run on.
+
+Furthermore, it is useful to fix the container *name* and the container
+*hostname* to the same value. This way every container is able to
+resolve itself by the same name other containers use for it.
+
+Lastly, it is useful to set the *domainname* equal to the name of the
+*network* and use *FQDN*s to reference the various services.
+
+With this setup you'll be able to access the web interfaces of the
+various components without the annoying problem of unresolved links
+(provided that you set up a DNS solution to resolve container names and
+configure static routing if using *docker-machine*).
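+
+For example, a minimal sketch of such a setup with the plain *docker*
+CLI (the `hadoop` network and domain names match the *docker-compose*
+templates below):
+
+    docker network create hadoop
+    docker run -d --net hadoop --name namenode --hostname namenode.hadoop \
+        -e CLUSTER_NAME=test uhopper/hadoop-namenode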
+
+## Components
+
+### namenode
+
+The *hadoop-namenode* image starts a Hadoop NameNode (single instance).
+
+Additional environment variables:
+
+* `CLUSTER_NAME`: name of the *HDFS* cluster (used during the initial
+  formatting)
+
+Volumes:
+
+* `/hadoop/dfs/name`: *HDFS* filesystem name directory
+
+Mandatory configuration:
+
+* `CLUSTER_NAME`: cluster name
+
+*Docker-compose* template:
+
+    namenode:
+      image: uhopper/hadoop-namenode
+      hostname: namenode
+      container_name: namenode
+      domainname: hadoop
+      net: hadoop
+      volumes:
+        - <name_dir>:/hadoop/dfs/name
+      environment:
+        - GANGLIA_HOST=<ganglia_host>
+        - CLUSTER_NAME=<cluster_name>
+
+Once running you can connect to `http://<docker_host>:50070` to see
+the web UI.
+
+### datanode
+
+The *hadoop-datanode* image starts a Hadoop DataNode (multiple instances).
+
+Volumes:
+
+* `/hadoop/dfs/data`: *HDFS* filesystem data directory
+
+Mandatory configuration:
+
+* `CORE_CONF_fs_defaultFS`: *HDFS* address (e.g. `hdfs://<namenode_host>:8020`)
+
+*Docker-compose* template:
+
+    datanode1:
+      image: uhopper/hadoop-datanode
+      hostname: datanode1
+      container_name: datanode1
+      domainname: hadoop
+      net: hadoop
+      volumes:
+        - <data_dir>:/hadoop/dfs/data
+      environment:
+        - GANGLIA_HOST=<ganglia_host>
+        - CORE_CONF_fs_defaultFS=hdfs://<namenode_host>:8020
+
+### resourcemanager
+
+The *hadoop-resourcemanager* image starts a Hadoop ResourceManager
+(single instance).
+
+Mandatory configuration:
+
+* `CORE_CONF_fs_defaultFS`: *HDFS* address (e.g. `hdfs://<namenode_host>:8020`)
+
+*Docker-compose* template:
+
+    resourcemanager:
+      image: uhopper/hadoop-resourcemanager
+      hostname: resourcemanager
+      container_name: resourcemanager
+      domainname: hadoop
+      net: hadoop
+      environment:
+        - GANGLIA_HOST=<ganglia_host>
+        - CORE_CONF_fs_defaultFS=hdfs://<namenode_host>:8020
+        - YARN_CONF_yarn_log___aggregation___enable=true
+
+Once running you can connect to `http://<docker_host>:8088` to see
+the web UI.
+
+### nodemanager
+
+The *hadoop-nodemanager* image starts a Hadoop NodeManager (multiple
+instances).
+
+Mandatory configuration:
+
+* `CORE_CONF_fs_defaultFS`: *HDFS* address (e.g. `hdfs://<namenode_host>:8020`)
+* `YARN_CONF_yarn_resourcemanager_hostname`: *resourcemanager* host
+
+*Docker-compose* template:
+
+    nodemanager1:
+      image: uhopper/hadoop-nodemanager
+      hostname: nodemanager1
+      container_name: nodemanager1
+      domainname: hadoop
+      net: hadoop
+      environment:
+        - GANGLIA_HOST=<ganglia_host>
+        - CORE_CONF_fs_defaultFS=hdfs://<namenode_host>:8020
+        - YARN_CONF_yarn_resourcemanager_hostname=<resourcemanager_host>
+        - YARN_CONF_yarn_log___aggregation___enable=true
+        - YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs
+
+### spark
+
+The *hadoop-spark* image is a utility container which provides a Spark
+environment configured for the *hadoop* cluster.
+
+The image itself doesn't specify any command since no services are
+exposed. You are expected to specify one yourself via `docker run
+uhopper/hadoop-spark <command>`.
+
+A common approach is to keep the container alive using `tail -f
+/var/log/dmesg` as the command and then connect to it via `docker exec
+-ti spark bash` to get a *spark* environment.
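+
+For example, a sketch of running the bundled SparkPi example on *YARN*
+from such a container (the jar path assumes the stock Spark
+distribution layout):
+
+    docker exec -ti spark bash -c \
+        'spark-submit --master yarn --deploy-mode client \
+            --class org.apache.spark.examples.SparkPi \
+            $SPARK_HOME/examples/jars/spark-examples_*.jar 100'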
+
+Mandatory configuration:
+
+* `CORE_CONF_fs_defaultFS`: *HDFS* address (e.g. `hdfs://<namenode_host>:8020`)
+* `YARN_CONF_yarn_resourcemanager_hostname`: *resourcemanager* host
+
+*Docker-compose* template:
+
+    spark:
+      image: uhopper/hadoop-spark
+      hostname: spark
+      container_name: spark
+      domainname: hadoop
+      net: hadoop
+      environment:
+        - CORE_CONF_fs_defaultFS=hdfs://namenode:8020
+        - YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
+      command: tail -f /var/log/dmesg
diff --git a/hadoop/buildall.sh b/hadoop/buildall.sh
new file mode 100755
index 0000000000000000000000000000000000000000..2fadca3170c180a5af8df4c7dd4ec3c4086800c8
--- /dev/null
+++ b/hadoop/buildall.sh
@@ -0,0 +1,22 @@
+#!/bin/sh
+set -e
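+
+# Builds every image against the given Hadoop version. Usage (sketch):
+#   ./buildall.sh <hadoop_version> [<tag>]
+# or, equivalently:
+#   HADOOP_VERSION=<hadoop_version> HADOOP_TAG=<tag> ./buildall.sh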
+
+if [ $# -gt 0 ]; then
+  HADOOP_VERSION=$1
+fi
+if [ $# -gt 1 ]; then
+  HADOOP_TAG=$2
+fi
+# Default the tag to "latest" whether or not arguments were given
+HADOOP_TAG=${HADOOP_TAG:-"latest"}
+
+if [ -z ${HADOOP_VERSION+x} ]; then
+  echo "Must define the HADOOP_VERSION environment variable, or pass it as the first argument"
+  exit 1
+fi
+
+for i in hadoop namenode datanode resourcemanager nodemanager historyserver spark; do
+  echo "Building $i"
+  [ "$i" = "hadoop" ] && name="hadoop" || name="hadoop-$i"
+  ( cd $i && docker build --build-arg HADOOP_VERSION=$HADOOP_VERSION --build-arg HADOOP_TAG=$HADOOP_TAG -t registry-vpc.cn-hangzhou.aliyuncs.com/schbrain/$name:$HADOOP_TAG . )
+done
diff --git a/hadoop/datanode/Dockerfile b/hadoop/datanode/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..310f10793e1345bc9284a398d3eb5b9d9f597f6d
--- /dev/null
+++ b/hadoop/datanode/Dockerfile
@@ -0,0 +1,11 @@
+ARG HADOOP_TAG
+FROM registry-vpc.cn-hangzhou.aliyuncs.com/schbrain/hadoop:${HADOOP_TAG}
+
+ENV HDFS_CONF_dfs_datanode_data_dir=file:///hadoop/dfs/data
+RUN mkdir -p /hadoop/dfs/data
+VOLUME /hadoop/dfs/data
+
+ADD run.sh /run.sh
+RUN chmod a+x /run.sh
+
+CMD ["/run.sh"]
diff --git a/hadoop/datanode/hooks/build b/hadoop/datanode/hooks/build
new file mode 100644
index 0000000000000000000000000000000000000000..b74a09e55c419469fdf7856bca25866a4a7e374b
--- /dev/null
+++ b/hadoop/datanode/hooks/build
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+source ../hadoop/hooks/build
\ No newline at end of file
diff --git a/hadoop/datanode/run.sh b/hadoop/datanode/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..9f57ee254834f73d5c3245494a5606a73030413c
--- /dev/null
+++ b/hadoop/datanode/run.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+datadir=`echo $HDFS_CONF_dfs_datanode_data_dir | perl -pe 's#file://##'`
+if [ ! -d "$datadir" ]; then
+  echo "Datanode data directory not found: $datadir"
+  exit 2
+fi
+
+$HADOOP_PREFIX/bin/hdfs --config $HADOOP_CONF_DIR datanode
diff --git a/hadoop/hadoop.env b/hadoop/hadoop.env
new file mode 100644
index 0000000000000000000000000000000000000000..107ff09252e89cf678243a7f6d060596d7e3041c
--- /dev/null
+++ b/hadoop/hadoop.env
@@ -0,0 +1,19 @@
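+# Shared cluster settings, meant to be passed to every service via
+# docker-compose's `env_file: ./hadoop.env` (hostnames below assume the
+# naming and network conventions described in README.md).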
+GANGLIA_HOST=ganglia.hadoop
+
+CORE_CONF_fs_defaultFS=hdfs://namenode.hadoop:8020
+CORE_CONF_hadoop_http_staticuser_user=root
+
+YARN_CONF_yarn_log___aggregation___enable=true
+YARN_CONF_yarn_resourcemanager_recovery_enabled=true
+YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore
+YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate
+YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs
+
+YARN_CONF_yarn_log_server_url=http://historyserver.hadoop:8188/applicationhistory/logs/
+YARN_CONF_yarn_timeline___service_enabled=true
+YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true
+YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true
+
+YARN_CONF_yarn_resourcemanager_hostname=resourcemanager.hadoop
+YARN_CONF_yarn_timeline___service_hostname=historyserver.hadoop
\ No newline at end of file
diff --git a/hadoop/hadoop/Dockerfile b/hadoop/hadoop/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..c27d8caf7439c1b6a0af39a51a140fd6eff76064
--- /dev/null
+++ b/hadoop/hadoop/Dockerfile
@@ -0,0 +1,35 @@
+FROM debian:stretch-backports
+
+COPY sources.list /etc/apt/sources.list
+
+RUN apt-get update \
+  && DEBIAN_FRONTEND=noninteractive apt-get install -y -t stretch-backports --no-install-recommends openjdk-8-jre-headless ca-certificates-java
+
+ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/
+
+RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends net-tools curl gnupg \
+  && rm -rf /var/lib/apt/lists/*
+
+ADD functions.sh /functions.sh
+
+ARG HADOOP_VERSION
+ENV HADOOP_VERSION ${HADOOP_VERSION}
+RUN . /functions.sh \
+  && apache_install hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz hadoop/common/KEYS \
+  && ln -s /opt/hadoop-$HADOOP_VERSION/etc/hadoop /etc/hadoop \
+  #&& cp /etc/hadoop/mapred-site.xml.template /etc/hadoop/mapred-site.xml \
+  && mkdir /opt/hadoop-$HADOOP_VERSION/logs \
+  && mkdir /hadoop-data \
+  && rm -Rf /opt/hadoop-$HADOOP_VERSION/share/doc/hadoop
+
+ENV HADOOP_PREFIX=/opt/hadoop-$HADOOP_VERSION
+ENV HADOOP_CONF_DIR=/etc/hadoop
+ENV MULTIHOMED_NETWORK=1
+
+ENV USER=root
+ENV PATH $HADOOP_PREFIX/bin/:$PATH
+
+ADD entrypoint.sh /entrypoint.sh
+RUN chmod a+x /entrypoint.sh
+
+ENTRYPOINT ["/entrypoint.sh"]
diff --git a/hadoop/hadoop/entrypoint.sh b/hadoop/hadoop/entrypoint.sh
new file mode 100644
index 0000000000000000000000000000000000000000..75d06f710f416aab4780241825b64b7905438a92
--- /dev/null
+++ b/hadoop/hadoop/entrypoint.sh
@@ -0,0 +1,97 @@
+#!/bin/bash
+
+# Set some sensible defaults
+export CORE_CONF_fs_defaultFS=${CORE_CONF_fs_defaultFS:-hdfs://`hostname -f`:8020}
+
+source /functions.sh
+
+configure /etc/hadoop/core-site.xml core CORE_CONF
+configure /etc/hadoop/hdfs-site.xml hdfs HDFS_CONF
+configure /etc/hadoop/yarn-site.xml yarn YARN_CONF
+configure /etc/hadoop/httpfs-site.xml httpfs HTTPFS_CONF
+configure /etc/hadoop/kms-site.xml kms KMS_CONF
+
+if [ "$MULTIHOMED_NETWORK" = "1" ]; then
+  echo "Configuring for multihomed network"
+
+  # HDFS
+  addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.rpc-bind-host 0.0.0.0
+  addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.servicerpc-bind-host 0.0.0.0
+  addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.http-bind-host 0.0.0.0
+  addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.https-bind-host 0.0.0.0
+  addProperty /etc/hadoop/hdfs-site.xml dfs.client.use.datanode.hostname true
+  addProperty /etc/hadoop/hdfs-site.xml dfs.datanode.use.datanode.hostname true
+
+  # YARN
+  addProperty /etc/hadoop/yarn-site.xml yarn.resourcemanager.bind-host 0.0.0.0
+  addProperty /etc/hadoop/yarn-site.xml yarn.nodemanager.bind-host 0.0.0.0
+  addProperty /etc/hadoop/yarn-site.xml yarn.timeline-service.bind-host 0.0.0.0
+
+  # MAPRED
+  addProperty /etc/hadoop/mapred-site.xml yarn.nodemanager.bind-host 0.0.0.0
+fi
+
+if [ -n "$GANGLIA_HOST" ]; then
+  mv /etc/hadoop/hadoop-metrics.properties /etc/hadoop/hadoop-metrics.properties.orig
+  mv /etc/hadoop/hadoop-metrics2.properties /etc/hadoop/hadoop-metrics2.properties.orig
+
+  for module in mapred jvm rpc ugi; do
+    echo "$module.class=org.apache.hadoop.metrics.ganglia.GangliaContext31"
+    echo "$module.period=10"
+    echo "$module.servers=$GANGLIA_HOST:8649"
+  done > /etc/hadoop/hadoop-metrics.properties
+
+  for module in namenode datanode resourcemanager nodemanager mrappmaster jobhistoryserver; do
+    echo "$module.sink.ganglia.class=org.apache.hadoop.metrics2.sink.ganglia.GangliaSink31"
+    echo "$module.sink.ganglia.period=10"
+    echo "$module.sink.ganglia.supportsparse=true"
+    echo "$module.sink.ganglia.slope=jvm.metrics.gcCount=zero,jvm.metrics.memHeapUsedM=both"
+    echo "$module.sink.ganglia.dmax=jvm.metrics.threadsBlocked=70,jvm.metrics.memHeapUsedM=40"
+    echo "$module.sink.ganglia.servers=$GANGLIA_HOST:8649"
+  done > /etc/hadoop/hadoop-metrics2.properties
+fi
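+
+# HOST_RESOLVER selects the name lookup order in /etc/nsswitch.conf (see
+# configureHostResolver in functions.sh); when unset, the distro default
+# is kept.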
(Specify HOST_RESOLVER to change)" + ;; + + files_only) + echo "Configure host resolver to only use files" + configureHostResolver files + ;; + + dns_only) + echo "Configure host resolver to only use dns" + configureHostResolver dns + ;; + + dns_files) + echo "Configure host resolver to use in order dns, files" + configureHostResolver dns files + ;; + + files_dns) + echo "Configure host resolver to use in order files, dns" + configureHostResolver files dns + ;; + + *) + echo "Unrecognised network resolver configuration [${HOST_RESOLVER}]: allowed values are files_only, dns_only, dns_files, files_dns. Ignoring..." + ;; +esac + + +if [ -n "$HADOOP_CUSTOM_CONF_DIR" ]; then + if [ -d "$HADOOP_CUSTOM_CONF_DIR" ]; then + for f in `ls $HADOOP_CUSTOM_CONF_DIR/`; do + echo "Applying custom Hadoop configuration file: $f" + ln -sfn "$HADOOP_CUSTOM_CONF_DIR/$f" "/etc/hadoop/$f" + done + else + echo >&2 "Hadoop custom configuration directory not found or not a directory. Ignoring: $HADOOP_CUSTOM_CONF_DIR" + fi +fi + +exec $@ diff --git a/hadoop/hadoop/functions.sh b/hadoop/hadoop/functions.sh new file mode 100755 index 0000000000000000000000000000000000000000..1d677a9009cdcfd66e3fdc122f364225f6ce0907 --- /dev/null +++ b/hadoop/hadoop/functions.sh @@ -0,0 +1,93 @@ +#!/bin/bash + +apache_mirror() { + BASE_URI=$(curl -L http://www.apache.org/dyn/closer.cgi$1\?asjson\=1 | grep -Eo '"preferred":.*?".*?[^\\]"' | cut -d ' ' -f 2 | sed 's/"//g') + echo $BASE_URI$1 +} + +# Attempt to download Apache project from mirror, otherwise fallback to Apache archive. places +# downloaded file in the `/tmp` directory +# +# Arguments: +# URL path to project archive +apache_dl() { + DL_URL=$(apache_mirror $1) + APACHE_FN=$(basename $1) + curl -fSL $DL_URL -o /tmp/$APACHE_FN + if [ $? != 0 ]; then + # Not in mirror, download from archive + curl -fSL "https://archive.apache.org/dist/$1" -o /tmp/$APACHE_FN + SIG_URL=https://archive.apache.org/dist/$1.asc + else + # Downloaded from mirror, grab keys from release repo + SIG_URL=https://dist.apache.org/repos/dist/release/$1.asc + fi + curl -fSL "$SIG_URL" -o /tmp/$APACHE_FN.asc + echo "/tmp/$APACHE_FN" +} + +# Verifies Apache signature using package keys +# +# Arguments: +# Path to package to verify, signature should be in same path with .asc appended to name +# URL path to KEYS file for Apache project +apache_verify() { + curl -fSL https://dist.apache.org/repos/dist/release/$2 -o /tmp/KEYS + gpg --no-default-keyring --keyring /tmp/keys.gpg --import /tmp/KEYS + if gpgv --keyring /tmp/keys.gpg $1.asc $1; then + rm -rf /tmp/keys.gpg* /tmp/KEYS + return 0 + else + echo "Validation of signature failed for $1" + return 1 + fi +} + +# Downloads and installs Apache project in `/opt` directory. 
+
+apache_mirror() {
+  BASE_URI=$(curl -L http://www.apache.org/dyn/closer.cgi$1\?asjson\=1 | grep -Eo '"preferred":.*?".*?[^\\]"' | cut -d ' ' -f 2 | sed 's/"//g')
+  echo $BASE_URI$1
+}
+
+# Attempts to download an Apache project from a mirror, falling back to
+# the Apache archive. Places the downloaded file in the `/tmp` directory.
+#
+# Arguments:
+#   URL path to the project archive
+apache_dl() {
+  DL_URL=$(apache_mirror $1)
+  APACHE_FN=$(basename $1)
+  curl -fSL $DL_URL -o /tmp/$APACHE_FN
+  if [ $? != 0 ]; then
+    # Not in mirror, download from the archive
+    curl -fSL "https://archive.apache.org/dist/$1" -o /tmp/$APACHE_FN
+    SIG_URL=https://archive.apache.org/dist/$1.asc
+  else
+    # Downloaded from mirror, grab the signature from the release repo
+    SIG_URL=https://dist.apache.org/repos/dist/release/$1.asc
+  fi
+  curl -fSL "$SIG_URL" -o /tmp/$APACHE_FN.asc
+  echo "/tmp/$APACHE_FN"
+}
+
+# Verifies an Apache signature using the project's KEYS file.
+#
+# Arguments:
+#   Path to the package to verify; the signature is expected at the same
+#   path with .asc appended to the name
+#   URL path to the KEYS file for the Apache project
+apache_verify() {
+  curl -fSL https://dist.apache.org/repos/dist/release/$2 -o /tmp/KEYS
+  gpg --no-default-keyring --keyring /tmp/keys.gpg --import /tmp/KEYS
+  if gpgv --keyring /tmp/keys.gpg $1.asc $1; then
+    rm -rf /tmp/keys.gpg* /tmp/KEYS
+    return 0
+  else
+    echo "Validation of signature failed for $1"
+    return 1
+  fi
+}
+
+# Downloads, verifies and installs an Apache project in the `/opt`
+# directory.
+#
+# Arguments:
+#   URL path to the Apache project archive to download
+#   URL path to the KEYS file for the Apache project
+apache_install() {
+  APACHE_FN=$(apache_dl $1)
+  if apache_verify $APACHE_FN $2; then
+    tar -xf $APACHE_FN -C /opt/
+    rm -rf $APACHE_FN*
+    return 0
+  else
+    echo "Install failed: $1"
+    return 1
+  fi
+}
+
+# Appends a <property> entry with the given name and value to the
+# <configuration> element of the given Hadoop configuration file.
+addProperty() {
+  local path=$1
+  local name=$2
+  local value=$3
+
+  local entry="<property><name>$name</name><value>${value}</value></property>"
+  local escapedEntry=$(echo $entry | sed 's/\//\\\//g')
+  sed -i "/<\/configuration>/ s/.*/${escapedEntry}\n&/" $path
+}
+
+# Populates the given Hadoop configuration file from all environment
+# variables carrying the given prefix.
+configure() {
+  local path=$1
+  local module=$2
+  local envPrefix=$3
+
+  local var
+  local value
+
+  echo "Configuring $module"
+  for c in `printenv | perl -sne 'print "$1 " if m/^${envPrefix}_(.+?)=.*/' -- -envPrefix=$envPrefix`; do
+    name=`echo ${c} | perl -pe 's/___/-/g; s/__/_/g; s/_/./g'`
+    var="${envPrefix}_${c}"
+    value=${!var}
+    echo " - Setting $name=$value"
+    addProperty $path $name "$value"
+  done
+}
+
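+# Rewrites the "hosts:" line of /etc/nsswitch.conf to use the given
+# sources, e.g. `configureHostResolver files dns` yields "hosts: files dns".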
+configureHostResolver() {
+  sed -i "/hosts:/ s/.*/hosts: $*/" /etc/nsswitch.conf
+}
diff --git a/hadoop/hadoop/hooks/build b/hadoop/hadoop/hooks/build
new file mode 100644
index 0000000000000000000000000000000000000000..0c6c8c43beedf033041f0036cf822f6fcd71538d
--- /dev/null
+++ b/hadoop/hadoop/hooks/build
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+if [ "$SOURCE_BRANCH" != "master" ]; then
+  HADOOP_VERSION=$SOURCE_BRANCH
+else
+  HADOOP_VERSION=3.2.1
+fi
+
+echo "Building $IMAGE_NAME for Hadoop $HADOOP_VERSION..."
+
+HADOOP_TAG=$(echo $IMAGE_NAME | cut -d ':' -f 2)
+
+docker build \
+  --build-arg HADOOP_VERSION=$HADOOP_VERSION \
+  --build-arg HADOOP_TAG=$HADOOP_TAG \
+  -t $IMAGE_NAME .
diff --git a/hadoop/hadoop/hooks/post_push b/hadoop/hadoop/hooks/post_push
new file mode 100644
index 0000000000000000000000000000000000000000..a1df6b98975d7f629e4188947bb55f233d3df5e3
--- /dev/null
+++ b/hadoop/hadoop/hooks/post_push
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+# Invoke dependent build triggers
+for url in $(echo $BUILD_TRIGGERS | sed "s/,/ /g"); do
+  curl -X POST -H "Content-Type: application/json" --data "{ \"build\": true, \"source_name\": \"$SOURCE_BRANCH\" }" $url
+done
diff --git a/hadoop/hadoop/sources.list b/hadoop/hadoop/sources.list
new file mode 100644
index 0000000000000000000000000000000000000000..20f9a2bbbc6f336d56955d65829500a2f9b6fe76
--- /dev/null
+++ b/hadoop/hadoop/sources.list
@@ -0,0 +1,8 @@
+deb http://mirrors.aliyun.com/debian/ stretch main non-free contrib
+deb-src http://mirrors.aliyun.com/debian/ stretch main non-free contrib
+deb http://mirrors.aliyun.com/debian-security stretch/updates main
+deb-src http://mirrors.aliyun.com/debian-security stretch/updates main
+deb http://mirrors.aliyun.com/debian/ stretch-updates main non-free contrib
+deb-src http://mirrors.aliyun.com/debian/ stretch-updates main non-free contrib
+deb http://mirrors.aliyun.com/debian/ stretch-backports main non-free contrib
+deb-src http://mirrors.aliyun.com/debian/ stretch-backports main non-free contrib
diff --git a/hadoop/historyserver/Dockerfile b/hadoop/historyserver/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..b0db8d891523f13665ef27202363020b0e20c4b5
--- /dev/null
+++ b/hadoop/historyserver/Dockerfile
@@ -0,0 +1,11 @@
+ARG HADOOP_TAG
+FROM registry-vpc.cn-hangzhou.aliyuncs.com/schbrain/hadoop:${HADOOP_TAG}
+
+ENV YARN_CONF_yarn_timeline___service_leveldb___timeline___store_path=/hadoop/yarn/timeline
+RUN mkdir -p /hadoop/yarn/timeline
+VOLUME /hadoop/yarn/timeline
+
+ADD run.sh /run.sh
+RUN chmod a+x /run.sh
+
+CMD ["/run.sh"]
diff --git a/hadoop/historyserver/hooks/build b/hadoop/historyserver/hooks/build
new file mode 100644
index 0000000000000000000000000000000000000000..b74a09e55c419469fdf7856bca25866a4a7e374b
--- /dev/null
+++ b/hadoop/historyserver/hooks/build
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+source ../hadoop/hooks/build
\ No newline at end of file
diff --git a/hadoop/historyserver/run.sh b/hadoop/historyserver/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..1ce663321e8f93a29de055d3f491614c74544f73
--- /dev/null
+++ b/hadoop/historyserver/run.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+$HADOOP_PREFIX/bin/yarn --config $HADOOP_CONF_DIR historyserver
diff --git a/hadoop/namenode-checkpoint/Dockerfile b/hadoop/namenode-checkpoint/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..920d53d94eb9725ddde01041914a211970fc7df6
--- /dev/null
+++ b/hadoop/namenode-checkpoint/Dockerfile
@@ -0,0 +1,10 @@
+ARG HADOOP_TAG
+FROM registry-vpc.cn-hangzhou.aliyuncs.com/schbrain/hadoop:${HADOOP_TAG}
+
+ENV HDFS_CONF_dfs_namenode_name_dir=file:///hadoop/dfs/name
+RUN mkdir -p /hadoop/dfs/name/checkpoint
+
+ADD run.sh /run.sh
+RUN chmod a+x /run.sh
+
+CMD ["/run.sh"]
diff --git a/hadoop/namenode-checkpoint/hooks/build b/hadoop/namenode-checkpoint/hooks/build
new file mode 100644
index 0000000000000000000000000000000000000000..b74a09e55c419469fdf7856bca25866a4a7e374b
--- /dev/null
+++ b/hadoop/namenode-checkpoint/hooks/build
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+source ../hadoop/hooks/build
\ No newline at end of file
diff --git a/hadoop/namenode-checkpoint/run.sh b/hadoop/namenode-checkpoint/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..74f7b8534032f79015e53f641f75cfd3185bd80a
--- /dev/null
+++ b/hadoop/namenode-checkpoint/run.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+$HADOOP_PREFIX/bin/hdfs --config $HADOOP_CONF_DIR namenode -checkpoint
diff --git a/hadoop/namenode/Dockerfile b/hadoop/namenode/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..cfdaee4c3195f73df8a8a513e43621178dfa3cae
--- /dev/null
+++ b/hadoop/namenode/Dockerfile
@@ -0,0 +1,11 @@
+ARG HADOOP_TAG
+FROM registry-vpc.cn-hangzhou.aliyuncs.com/schbrain/hadoop:${HADOOP_TAG}
+
+ENV HDFS_CONF_dfs_namenode_name_dir=file:///hadoop/dfs/name
+RUN mkdir -p /hadoop/dfs/name
+VOLUME /hadoop/dfs/name
+
+ADD run.sh /run.sh
+RUN chmod a+x /run.sh
+
+CMD ["/run.sh"]
diff --git a/hadoop/namenode/hooks/build b/hadoop/namenode/hooks/build
new file mode 100644
index 0000000000000000000000000000000000000000..b74a09e55c419469fdf7856bca25866a4a7e374b
--- /dev/null
+++ b/hadoop/namenode/hooks/build
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+source ../hadoop/hooks/build
\ No newline at end of file
diff --git a/hadoop/namenode/run.sh b/hadoop/namenode/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..8ef079702580020c835c12b3ced9a50c6cef35e2
--- /dev/null
+++ b/hadoop/namenode/run.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
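+
+# Derive the local name directory from HDFS_CONF_dfs_namenode_name_dir
+# (file:///hadoop/dfs/name by default) and format it on first start.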
+
+namedir=`echo $HDFS_CONF_dfs_namenode_name_dir | perl -pe 's#file://##'`
+if [ ! -d "$namedir" ]; then
+  echo "Namenode name directory not found: $namedir"
+  exit 2
+fi
+
+if [ -z "$CLUSTER_NAME" ]; then
+  echo "Cluster name not specified"
+  exit 2
+fi
+
+if [ "`ls -A $namedir`" == "" ]; then
+  echo "Formatting namenode name directory: $namedir"
+  $HADOOP_PREFIX/bin/hdfs --config $HADOOP_CONF_DIR namenode -format $CLUSTER_NAME
+fi
+
+$HADOOP_PREFIX/bin/hdfs --config $HADOOP_CONF_DIR namenode
diff --git a/hadoop/nodemanager/Dockerfile b/hadoop/nodemanager/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..bd4e94c61290ca554862115eaaf93f9c2270a05c
--- /dev/null
+++ b/hadoop/nodemanager/Dockerfile
@@ -0,0 +1,7 @@
+ARG HADOOP_TAG
+FROM registry-vpc.cn-hangzhou.aliyuncs.com/schbrain/hadoop:${HADOOP_TAG}
+
+ADD run.sh /run.sh
+RUN chmod a+x /run.sh
+
+CMD ["/run.sh"]
diff --git a/hadoop/nodemanager/hooks/build b/hadoop/nodemanager/hooks/build
new file mode 100644
index 0000000000000000000000000000000000000000..b74a09e55c419469fdf7856bca25866a4a7e374b
--- /dev/null
+++ b/hadoop/nodemanager/hooks/build
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+source ../hadoop/hooks/build
\ No newline at end of file
diff --git a/hadoop/nodemanager/run.sh b/hadoop/nodemanager/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..115bcdb1d7f5f013098c4774f715f0dbd5c23127
--- /dev/null
+++ b/hadoop/nodemanager/run.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+$HADOOP_PREFIX/bin/yarn --config $HADOOP_CONF_DIR nodemanager
diff --git a/hadoop/resourcemanager/Dockerfile b/hadoop/resourcemanager/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..bd4e94c61290ca554862115eaaf93f9c2270a05c
--- /dev/null
+++ b/hadoop/resourcemanager/Dockerfile
@@ -0,0 +1,7 @@
+ARG HADOOP_TAG
+FROM registry-vpc.cn-hangzhou.aliyuncs.com/schbrain/hadoop:${HADOOP_TAG}
+
+ADD run.sh /run.sh
+RUN chmod a+x /run.sh
+
+CMD ["/run.sh"]
diff --git a/hadoop/resourcemanager/hooks/build b/hadoop/resourcemanager/hooks/build
new file mode 100644
index 0000000000000000000000000000000000000000..b74a09e55c419469fdf7856bca25866a4a7e374b
--- /dev/null
+++ b/hadoop/resourcemanager/hooks/build
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+source ../hadoop/hooks/build
\ No newline at end of file
diff --git a/hadoop/resourcemanager/run.sh b/hadoop/resourcemanager/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c1bdb94cd61c34e593d936558bd1bfd7f4983a4e
--- /dev/null
+++ b/hadoop/resourcemanager/run.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+$HADOOP_PREFIX/bin/yarn --config $HADOOP_CONF_DIR resourcemanager
diff --git a/hadoop/spark/Dockerfile b/hadoop/spark/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..a5d6bb297650d2a2f8764f62036b01bac2ae0ac9
--- /dev/null
+++ b/hadoop/spark/Dockerfile
@@ -0,0 +1,29 @@
+ARG HADOOP_TAG
+FROM registry-vpc.cn-hangzhou.aliyuncs.com/schbrain/hadoop:${HADOOP_TAG}
+
+ARG SPARK_VERSION=2.4.5
+ENV SPARK_VERSION ${SPARK_VERSION}
+ENV SPARK_HOME=/opt/spark-$SPARK_VERSION
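+
+# The "without hadoop" Spark build is installed; the Hadoop jars from
+# the base image are wired in via SPARK_DIST_CLASSPATH in spark-env.sh
+# below.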
+
+RUN set -x \
+  && . /functions.sh \
+  && apache_install spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-without-hadoop.tgz spark/KEYS \
+  && mv /opt/spark-$SPARK_VERSION-* $SPARK_HOME
+
+WORKDIR $SPARK_HOME
+ENV PATH $SPARK_HOME/bin:$PATH
+
+ADD spark-entrypoint.sh /
+ADD spark-historyserver.sh /
+ADD spark-master.sh /
+ADD spark-slave.sh /
+
+RUN chmod a+x \
+  /spark-entrypoint.sh \
+  /spark-historyserver.sh \
+  /spark-master.sh \
+  /spark-slave.sh
+
+RUN echo "export SPARK_DIST_CLASSPATH=$(hadoop classpath)" >> /opt/spark-$SPARK_VERSION/conf/spark-env.sh
+
+ENTRYPOINT ["/spark-entrypoint.sh"]
diff --git a/hadoop/spark/hooks/build b/hadoop/spark/hooks/build
new file mode 100644
index 0000000000000000000000000000000000000000..b74a09e55c419469fdf7856bca25866a4a7e374b
--- /dev/null
+++ b/hadoop/spark/hooks/build
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+source ../hadoop/hooks/build
\ No newline at end of file
diff --git a/hadoop/spark/spark-entrypoint.sh b/hadoop/spark/spark-entrypoint.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b81e7ff5af91b4aca8318d55e68b78b757bfbdc4
--- /dev/null
+++ b/hadoop/spark/spark-entrypoint.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+# Render SPARK_CONF_* environment variables (same naming scheme as the
+# Hadoop *_CONF variables) into spark-defaults.conf
+for c in `printenv | perl -sne 'print "$1 " if m/^SPARK_CONF_(.+?)=.*/'`; do
+  name=`echo ${c} | perl -pe 's/___/-/g; s/__/_/g; s/_/./g'`
+  var="SPARK_CONF_${c}"
+  value=${!var}
+  echo "Setting SPARK property $name=$value"
+  echo $name $value >> $SPARK_HOME/conf/spark-defaults.conf
+done
+
+case $1 in
+  master)
+    shift
+    exec /entrypoint.sh /spark-master.sh "$@"
+    ;;
+  slave)
+    shift
+    exec /entrypoint.sh /spark-slave.sh "$@"
+    ;;
+  historyserver)
+    shift
+    exec /entrypoint.sh /spark-historyserver.sh "$@"
+    ;;
+  submit)
+    shift
+    exec /entrypoint.sh spark-submit "$@"
+    ;;
+  *)
+    if [ "$HADOOP_ON_CLASSPATH" = "1" ]; then
+      export CLASSPATH="$(hadoop classpath)${CLASSPATH:+:$CLASSPATH}"
+    fi
+
+    if [ "$SPARK_ON_CLASSPATH" = "1" ]; then
+      export CLASSPATH="${SPARK_HOME}/jars/*${CLASSPATH:+:$CLASSPATH}"
+    fi
+
+    exec /entrypoint.sh "$@"
+    ;;
+esac
diff --git a/hadoop/spark/spark-historyserver.sh b/hadoop/spark/spark-historyserver.sh
new file mode 100644
index 0000000000000000000000000000000000000000..e2d4842b0556d8c0aa378201d91645de708b61a6
--- /dev/null
+++ b/hadoop/spark/spark-historyserver.sh
@@ -0,0 +1,3 @@
+#!/bin/sh
+
+spark-class org.apache.spark.deploy.history.HistoryServer
diff --git a/hadoop/spark/spark-master.sh b/hadoop/spark/spark-master.sh
new file mode 100644
index 0000000000000000000000000000000000000000..155eaac11334ad15079aaf0267f21ab9417b8de5
--- /dev/null
+++ b/hadoop/spark/spark-master.sh
@@ -0,0 +1,3 @@
+#!/bin/sh
+
+spark-class org.apache.spark.deploy.master.Master "$@"
diff --git a/hadoop/spark/spark-slave.sh b/hadoop/spark/spark-slave.sh
new file mode 100644
index 0000000000000000000000000000000000000000..40bd2db455c742a2a9a732e4cc974a3c6e9b863c
--- /dev/null
+++ b/hadoop/spark/spark-slave.sh
@@ -0,0 +1,3 @@
+#!/bin/sh
+
+spark-class org.apache.spark.deploy.worker.Worker "$@"
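+
+# Usage (sketch): the standalone worker expects the master URL as its
+# argument, e.g. /spark-slave.sh spark://<master_host>:7077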