Scalable Airflow with Kubernetes + Git Sync

Airflow Kubernetes architecture

Airflow docker image

apache-airflow[kubernetes]==1.10.6
#!/bin/bash
airflow initdb
airflow webserver -p 8080 &
airflow scheduler

Dags Repository

Airflow Deployment

GIT_SYNC_REPO: "https://github.com/xyz/dags-airflow"
GIT_SYNC_BRANCH: "master"
GIT_SYNC_ROOT: "/git"
GIT_SYNC_DEST: "sync"
GIT_SYNC_DEPTH: "1"
GIT_SYNC_ONE_TIME: "false"
GIT_SYNC_WAIT: "60"
GIT_SYNC_USERNAME: "git_username"
GIT_KNOWN_HOSTS: "false"
GIT_SYNC_PASSWORD: "242452"

Airflow Config File

[core]
executor = KubernetesExecutor
[kubernetes]
worker_container_repository = airflow
worker_container_tag = 1.2
worker_container_image_pull_policy = IfNotPresent
delete_worker_pods = True
namespace = dataanalytics
worker_service_account_name = k8s-cronjob-autoscaler
in_cluster = True
git_repo = https://github.com/xyz/dags-airflow
git_branch = master
git_subpath = dags
git_user = git_username
git_password = 242452
git_sync_root = /git
git_sync_dest = sync
git_sync_depth = 1
git_dags_folder_mount_point = /opt/airflow/dags
git_sync_container_repository = k8s.gcr.io/git-sync
git_sync_container_tag = v3.1.2
git_sync_init_container_name = git-sync-clone
[core]
remote_logging = True
remote_log_conn_id = s3_connection
remote_base_log_folder = s3://${ENVIRONMENT}-dataplatform-logs/airflow
[kubernetes_environment_variables]
AIRFLOW__CORE__REMOTE_LOGGING = True
AIRFLOW__CORE__REMOTE_LOG_CONN_ID = s3_connection
AIRFLOW__CORE__REMOTE_BASE_LOG_FOLDER = s3://${ENVIRONMENT}-dataplatform-logs/airflow
AIRFLOW__CORE__ENCRYPT_S3_LOGS = False

Conclusion

--

--

Get the Medium app

A button that says 'Download on the App Store'; if clicked, it will lead you to the iOS App Store
A button that says 'Get it on Google Play'; if clicked, it will lead you to the Google Play store