Setting up the Config

Create a file named config.py with the following code:

from datetime import datetime

def get_config(bucket_name, region_name):
    """Assemble the nested settings dictionary for the pipeline.

    The result has three top-level sections, in this order:

    * ``"job_level"``       - the region name and the hyperparameter-tuning
      switch (the string ``"no"``).
    * ``"train_model"``     - execution role name, estimator settings
      (instance sizing, output location, hyperparameters) and the
      train / validation input locations.
    * ``"batch_transform"`` - a ``"transform_config"`` dict describing the
      transform instance, the input prefix and the output prefix.

    Args:
        bucket_name: Name of the S3 bucket; it is interpolated into every
            ``s3://`` URI in the returned dictionary.
        region_name: Stored unchanged under ``job_level["region_name"]``.

    Returns:
        A freshly built ``dict``; nothing is cached or shared between calls.
    """

    def s3_uri(key):
        # Every location lives in the same bucket; only the object key varies.
        return "s3://{}/{}".format(bucket_name, key)

    # All hyperparameter values are deliberately strings, not numbers.
    # NOTE(review): the key set mixes tree-booster style names (max_depth,
    # eta, num_round, objective) with others (feature_dim, num_factors,
    # predictor_type, mini_batch_size, epochs) - confirm that the training
    # algorithm in use accepts all of them.
    hyperparameters = {
        "feature_dim": "178729",
        "epochs": "10",
        "mini_batch_size": "200",
        "num_factors": "64",
        "predictor_type": "regressor",
        "max_depth": "5",
        "eta": "0.2",
        "objective": "reg:linear",
        "early_stopping_rounds": "10",
        "num_round": "150",
    }

    estimator_config = {
        "train_instance_count": 1,
        "train_instance_type": "ml.m5.2xlarge",
        "train_volume_size": 5,  # GB of storage
        "train_max_run": 3600,  # presumably seconds - TODO confirm
        "output_path": s3_uri("xgboost/output"),
        "hyperparameters": hyperparameters,
    }

    training_inputs = {
        "train": s3_uri("xgboost/train/train.csv"),
        "validation": s3_uri("xgboost/validate/validate.csv"),
    }

    transform_config = {
        "instance_count": 1,
        "instance_type": "ml.c4.xlarge",
        "data": s3_uri("xgboost/test/"),
        "data_type": "S3Prefix",
        "content_type": "text/csv",
        "strategy": "SingleRecord",
        "split_type": "Line",
        "output_path": s3_uri("transform/"),
    }

    return {
        "job_level": {
            "region_name": region_name,
            "run_hyperparameter_opt": "no",
        },
        "train_model": {
            "sagemaker_role": "AirflowSageMakerExecutionRole",
            "estimator_config": estimator_config,
            "inputs": training_inputs,
        },
        "batch_transform": {
            "transform_config": transform_config,
        },
    }