D6906: ci: implement a new CI system for Mercurial
indygreg (Gregory Szorc)
phabricator at mercurial-scm.org
Sun Sep 29 16:51:17 UTC 2019
indygreg updated this revision to Diff 16678.
REPOSITORY
rHG Mercurial
CHANGES SINCE LAST UPDATE
https://phab.mercurial-scm.org/D6906?vs=16677&id=16678
CHANGES SINCE LAST ACTION
https://phab.mercurial-scm.org/D6906/new/
REVISION DETAIL
https://phab.mercurial-scm.org/D6906
AFFECTED FILES
.hgignore
contrib/ci/README.rst
contrib/ci/lambda_functions/ci.py
contrib/ci/lambda_functions/web.py
contrib/ci/terraform/account_manager.tf
contrib/ci/terraform/cloudwatch.tf
contrib/ci/terraform/iam.tf
contrib/ci/terraform/init.tf
contrib/ci/terraform/job_executor.tf
contrib/ci/terraform/job_result_reactor.tf
contrib/ci/terraform/repo_change_reactor.tf
contrib/ci/terraform/repo_poll.tf
contrib/ci/terraform/storage.tf
contrib/ci/terraform/web.tf
contrib/ci/terraform/worker.tf
tests/test-check-code.t
tests/test-check-py3-compat.t
CHANGE DETAILS
diff --git a/tests/test-check-py3-compat.t b/tests/test-check-py3-compat.t
--- a/tests/test-check-py3-compat.t
+++ b/tests/test-check-py3-compat.t
@@ -6,6 +6,7 @@
#if no-py3
$ testrepohg files 'set:(**.py)' \
> -X contrib/automation/ \
+ > -X contrib/ci/ \
> -X contrib/packaging/hgpackaging/ \
> -X contrib/packaging/inno/ \
> -X contrib/packaging/wix/ \
diff --git a/tests/test-check-code.t b/tests/test-check-code.t
--- a/tests/test-check-code.t
+++ b/tests/test-check-code.t
@@ -20,6 +20,8 @@
Skipping contrib/automation/hgautomation/ssh.py it has no-che?k-code (glob)
Skipping contrib/automation/hgautomation/windows.py it has no-che?k-code (glob)
Skipping contrib/automation/hgautomation/winrm.py it has no-che?k-code (glob)
+ Skipping contrib/ci/lambda_functions/ci.py it has no-che?k-code (glob)
+ Skipping contrib/ci/lambda_functions/web.py it has no-che?k-code (glob)
Skipping contrib/packaging/hgpackaging/downloads.py it has no-che?k-code (glob)
Skipping contrib/packaging/hgpackaging/inno.py it has no-che?k-code (glob)
Skipping contrib/packaging/hgpackaging/py2exe.py it has no-che?k-code (glob)
diff --git a/contrib/ci/terraform/worker.tf b/contrib/ci/terraform/worker.tf
new file mode 100644
--- /dev/null
+++ b/contrib/ci/terraform/worker.tf
@@ -0,0 +1,33 @@
+# Defines resources for the worker component of the CI system. This
+# revolves around the EC2 worker instances themselves.
+
+resource "aws_iam_role" "ci_worker" {
+ name = "ci-worker"
+ description = "Provides capabilities needed for an EC2 instance doing work"
+ assume_role_policy = data.aws_iam_policy_document.assume_role_ec2.json
+}
+
+resource "aws_iam_instance_profile" "ci_worker" {
+ name = "ci-worker"
+ role = aws_iam_role.ci_worker.name
+}
+
+data "aws_iam_policy_document" "ci_worker" {
+ # Allow CI worker EC2 instances to write artifacts to S3.
+ statement {
+ effect = "Allow"
+ actions = [
+ "s3:PutObject",
+ "s3:PutObjectAcl",
+ ]
+ resources = [
+ "${aws_s3_bucket.mercurial-ci.arn}/jobs/*",
+ ]
+ }
+}
+
+resource "aws_iam_role_policy" "ci_worker" {
+ role = aws_iam_role.ci_worker.name
+ name = aws_iam_role.ci_worker.name
+ policy = data.aws_iam_policy_document.ci_worker.json
+}
diff --git a/contrib/ci/terraform/web.tf b/contrib/ci/terraform/web.tf
new file mode 100644
--- /dev/null
+++ b/contrib/ci/terraform/web.tf
@@ -0,0 +1,163 @@
+resource "aws_iam_role" "lambda_ci_web" {
+ name = "lambda-ci-web"
+ description = "For Lambda function providing web site functionality"
+ assume_role_policy = data.aws_iam_policy_document.assume_role_lambda.json
+}
+
+resource "aws_lambda_function" "ci_web" {
+ function_name = "ci-web"
+ description = "Web interface to CI system"
+ filename = data.archive_file.lambda_ci.output_path
+ handler = "web.handler"
+ source_code_hash = data.archive_file.lambda_ci.output_base64sha256
+ runtime = "python3.7"
+ timeout = 30
+ role = aws_iam_role.lambda_ci_web.arn
+ environment {
+ variables = {
+ S3_BUCKET = aws_s3_bucket.mercurial-ci.bucket
+ DYNAMODB_REPO_POLL_TABLE = aws_dynamodb_table.ci_repo_poll.name
+ DYNAMODB_PUSH_TABLE = aws_dynamodb_table.ci_push.name
+ DYNAMODB_JOB_TABLE = aws_dynamodb_table.ci_job.name
+ DYNAMODB_TEST_RESULT_TABLE = aws_dynamodb_table.ci_test_result.name
+ }
+ }
+}
+
+resource "aws_cloudwatch_log_group" "lambda_ci_web" {
+ name = "/aws/lambda/${aws_lambda_function.ci_web.function_name}"
+ retention_in_days = 7
+}
+
+data "aws_iam_policy_document" "lambda_ci_web" {
+ # Allow Lambda function to write CloudWatch events.
+ statement {
+ effect = "Allow"
+ actions = [
+ "logs:CreateLogGroup",
+ "logs:CreateLogStream",
+ "logs:PutLogEvents",
+ ]
+ resources = [aws_cloudwatch_log_group.lambda_ci_web.arn]
+ }
+ # Allow Lambda function to read data from DynamoDB.
+ statement {
+ effect = "Allow"
+ actions = [
+ "dynamodb:GetItem",
+ "dynamodb:Query",
+ "dynamodb:Scan",
+ ]
+ resources = [
+ aws_dynamodb_table.ci_repo_poll.arn,
+ aws_dynamodb_table.ci_push.arn,
+ aws_dynamodb_table.ci_job.arn,
+ aws_dynamodb_table.ci_test_result.arn,
+ ]
+ }
+}
+
+resource "aws_iam_role_policy" "lambda_ci_web" {
+ role = aws_iam_role.lambda_ci_web.name
+ name = aws_iam_role.lambda_ci_web.name
+ policy = data.aws_iam_policy_document.lambda_ci_web.json
+}
+
+resource "aws_api_gateway_rest_api" "ci_web" {
+ name = "ci-web"
+ description = "Web interface for CI"
+}
+
+resource "aws_api_gateway_resource" "ci_web_proxy" {
+ rest_api_id = aws_api_gateway_rest_api.ci_web.id
+ parent_id = aws_api_gateway_rest_api.ci_web.root_resource_id
+ path_part = "{proxy+}"
+}
+
+resource "aws_api_gateway_method" "ci_web_proxy" {
+ rest_api_id = aws_api_gateway_rest_api.ci_web.id
+ resource_id = aws_api_gateway_resource.ci_web_proxy.id
+  http_method = "ANY"
+ authorization = "NONE"
+}
+
+resource "aws_api_gateway_integration" "ci_web_proxy" {
+ rest_api_id = aws_api_gateway_rest_api.ci_web.id
+ resource_id = aws_api_gateway_method.ci_web_proxy.resource_id
+ http_method = aws_api_gateway_method.ci_web_proxy.http_method
+ integration_http_method = "POST"
+ type = "AWS_PROXY"
+ uri = aws_lambda_function.ci_web.invoke_arn
+}
+
+resource "aws_api_gateway_method" "ci_web_proxy_root" {
+ rest_api_id = aws_api_gateway_rest_api.ci_web.id
+ resource_id = aws_api_gateway_rest_api.ci_web.root_resource_id
+ http_method = "ANY"
+ authorization = "NONE"
+}
+
+resource "aws_api_gateway_integration" "ci_web_proxy_root" {
+ rest_api_id = aws_api_gateway_rest_api.ci_web.id
+ resource_id = aws_api_gateway_method.ci_web_proxy_root.resource_id
+ http_method = aws_api_gateway_method.ci_web_proxy_root.http_method
+ integration_http_method = "POST"
+ type = "AWS_PROXY"
+ uri = aws_lambda_function.ci_web.invoke_arn
+}
+
+resource "aws_api_gateway_deployment" "ci_web" {
+ depends_on = [
+ aws_api_gateway_integration.ci_web_proxy,
+ aws_api_gateway_integration.ci_web_proxy_root,
+ ]
+ rest_api_id = aws_api_gateway_rest_api.ci_web.id
+ stage_name = "prod"
+}
+
+output "ci_web_raw_url" {
+ value = aws_api_gateway_deployment.ci_web.invoke_url
+}
+
+# Allow Lambda function to be invoked by API Gateway.
+resource "aws_lambda_permission" "ci_web" {
+ statement_id = "AllowAPIGatewayInvoke"
+ action = "lambda:InvokeFunction"
+ function_name = aws_lambda_function.ci_web.function_name
+ principal = "apigateway.amazonaws.com"
+ source_arn = "${aws_api_gateway_rest_api.ci_web.execution_arn}/*/*"
+}
+
+# x509 certificate for web site.
+resource "aws_acm_certificate" "ci" {
+ domain_name = var.ci_hostname
+ validation_method = "NONE"
+ tags = {}
+ options {
+ certificate_transparency_logging_preference = "DISABLED"
+ }
+}
+
+resource "aws_api_gateway_domain_name" "ci_web" {
+ domain_name = var.ci_hostname
+ regional_certificate_arn = aws_acm_certificate.ci.arn
+ security_policy = "TLS_1_2"
+
+ endpoint_configuration {
+ types = ["REGIONAL"]
+ }
+}
+
+output "hg_ci_results_regional_zone_id" {
+ value = aws_api_gateway_domain_name.ci_web.regional_zone_id
+}
+
+output "hg_ci_results_regional_domain_name" {
+ value = aws_api_gateway_domain_name.ci_web.regional_domain_name
+}
+
+resource "aws_api_gateway_base_path_mapping" "ci_results" {
+ api_id = aws_api_gateway_rest_api.ci_web.id
+ stage_name = aws_api_gateway_deployment.ci_web.stage_name
+ domain_name = aws_api_gateway_domain_name.ci_web.domain_name
+}
diff --git a/contrib/ci/terraform/storage.tf b/contrib/ci/terraform/storage.tf
new file mode 100644
--- /dev/null
+++ b/contrib/ci/terraform/storage.tf
@@ -0,0 +1,87 @@
+# Tracks repository poll state.
+resource "aws_dynamodb_table" "ci_repo_poll" {
+ name = "ci-repo-poll"
+ billing_mode = "PAY_PER_REQUEST"
+
+ attribute {
+ name = "repo"
+ type = "S"
+ }
+
+ hash_key = "repo"
+}
+
+# Tracks pushes to repositories. Effectively exposes a timeline
+# of events.
+resource "aws_dynamodb_table" "ci_push" {
+ name = "ci-push"
+ billing_mode = "PAY_PER_REQUEST"
+
+ attribute {
+ name = "repo"
+ type = "S"
+ }
+ attribute {
+ name = "push_id"
+ type = "S"
+ }
+
+ hash_key = "repo"
+ range_key = "push_id"
+
+ stream_enabled = true
+ stream_view_type = "NEW_AND_OLD_IMAGES"
+}
+
+# Tracks individual CI jobs and their results.
+resource "aws_dynamodb_table" "ci_job" {
+ name = "ci-job"
+ billing_mode = "PAY_PER_REQUEST"
+
+ attribute {
+ name = "job_id"
+ type = "S"
+ }
+
+ hash_key = "job_id"
+}
+
+# Tracks results for individual tests in each job.
+resource "aws_dynamodb_table" "ci_test_result" {
+ name = "ci-test-result"
+ billing_mode = "PAY_PER_REQUEST"
+
+ attribute {
+ name = "job_id"
+ type = "S"
+ }
+
+ attribute {
+ name = "test_name"
+ type = "S"
+ }
+
+ hash_key = "job_id"
+ range_key = "test_name"
+}
+
+resource "aws_s3_bucket" "private" {
+ bucket = "mercurial-ci-private"
+ region = "us-west-2"
+ acl = "private"
+}
+
+# Job artifacts.
+resource "aws_s3_bucket" "mercurial-ci" {
+ bucket = "mercurial-ci"
+ region = "us-west-2"
+ acl = "public-read"
+
+ lifecycle_rule {
+ id = "Purge old objects"
+ enabled = true
+ expiration {
+ days = 90
+ }
+ }
+}
diff --git a/contrib/ci/terraform/repo_poll.tf b/contrib/ci/terraform/repo_poll.tf
new file mode 100644
--- /dev/null
+++ b/contrib/ci/terraform/repo_poll.tf
@@ -0,0 +1,93 @@
+# Defines resources for the component that polls repositories for new pushes.
+
+resource "aws_iam_role" "lambda_ci_repo_poll" {
+ name = "lambda-ci-repo-poll"
+ description = "For Lambda function polling Mercurial repo"
+ assume_role_policy = data.aws_iam_policy_document.assume_role_lambda.json
+}
+
+# Lambda function that when invoked will poll a repository for new changes.
+# The environment configures which repository to poll and where to store
+# state.
+resource "aws_lambda_function" "ci_repo_poll_hg_committed" {
+ filename = data.archive_file.lambda_ci.output_path
+ function_name = "ci-repo-poll-hg-committed"
+ description = "Polls Mercurial repository for new changes"
+ handler = "ci.handle_poll_repo"
+ source_code_hash = data.archive_file.lambda_ci.output_base64sha256
+ runtime = "python3.7"
+ role = aws_iam_role.lambda_ci_repo_poll.arn
+ timeout = 60
+ environment {
+ variables = {
+ REPO_URL = "https://www.mercurial-scm.org/repo/hg-committed"
+ POLL_REVS = "@ stable"
+ DYNAMODB_REPO_POLL_TABLE = aws_dynamodb_table.ci_repo_poll.name
+ DYNAMODB_PUSH_TABLE = aws_dynamodb_table.ci_push.name
+ }
+ }
+}
+
+# Log group for Lambda function.
+resource "aws_cloudwatch_log_group" "lambda_ci_repo_poll_hg_committed" {
+ name = "/aws/lambda/${aws_lambda_function.ci_repo_poll_hg_committed.function_name}"
+ retention_in_days = 7
+}
+
+# Defines policy for repo polling.
+data "aws_iam_policy_document" "lambda_hg_repo_poll" {
+ # Allow Lambda function to write CloudWatch events.
+ statement {
+ effect = "Allow"
+ actions = [
+ "logs:CreateLogGroup",
+ "logs:CreateLogStream",
+ "logs:PutLogEvents",
+ ]
+ resources = [aws_cloudwatch_log_group.lambda_ci_repo_poll_hg_committed.arn]
+ }
+ # Enable tracking poll state in DynamoDB.
+ statement {
+ effect = "Allow"
+ actions = [
+ "dynamodb:GetItem",
+ "dynamodb:PutItem",
+ ]
+ resources = [aws_dynamodb_table.ci_repo_poll.arn]
+ }
+  # Enable inserting pushes into DynamoDB.
+ statement {
+ effect = "Allow"
+ actions = [
+ "dynamodb:PutItem",
+ ]
+ resources = [aws_dynamodb_table.ci_push.arn]
+ }
+}
+
+resource "aws_iam_role_policy" "lambda_hg_repo_poll" {
+ role = aws_iam_role.lambda_ci_repo_poll.name
+ name = aws_iam_role.lambda_ci_repo_poll.name
+ policy = data.aws_iam_policy_document.lambda_hg_repo_poll.json
+}
+
+# Periodically trigger the Lambda function to poll.
+resource "aws_cloudwatch_event_rule" "trigger_ci_repo_poll_hg_committed" {
+ name = "trigger-ci-repo-poll"
+ description = "Trigger polling hg repos for new changesets"
+ schedule_expression = "rate(1 minute)"
+}
+
+resource "aws_cloudwatch_event_target" "repo_poll_hg_committed" {
+ rule = aws_cloudwatch_event_rule.trigger_ci_repo_poll_hg_committed.name
+ arn = aws_lambda_function.ci_repo_poll_hg_committed.arn
+}
+
+# Allow the CloudWatch event rule to trigger the repo poll function.
+resource "aws_lambda_permission" "hg_ci_repo_poll_hg_committed_allow_cloudwatch" {
+ statement_id = "AllowExecutionFromCloudWatch"
+ action = "lambda:InvokeFunction"
+ function_name = aws_lambda_function.ci_repo_poll_hg_committed.function_name
+ principal = "events.amazonaws.com"
+ source_arn = aws_cloudwatch_event_rule.trigger_ci_repo_poll_hg_committed.arn
+}
diff --git a/contrib/ci/terraform/repo_change_reactor.tf b/contrib/ci/terraform/repo_change_reactor.tf
new file mode 100644
--- /dev/null
+++ b/contrib/ci/terraform/repo_change_reactor.tf
@@ -0,0 +1,113 @@
+# Defines resources for the component that reacts to push events.
+
+# Invoked when a new repository push is seen.
+resource "aws_lambda_function" "ci_push_reactor" {
+ function_name = "ci-push-reactor"
+ description = "Trigger CI for Mercurial from a pushed revision"
+ filename = data.archive_file.lambda_ci.output_path
+ handler = "ci.handle_schedule_from_push"
+ source_code_hash = data.archive_file.lambda_ci.output_base64sha256
+ runtime = "python3.7"
+ timeout = 300
+ role = aws_iam_role.lambda_ci_push_reactor.arn
+ environment {
+ variables = {
+ DYNAMODB_JOB_TABLE = aws_dynamodb_table.ci_job.name
+ S3_BUCKET = aws_s3_bucket.mercurial-ci.bucket
+ SQS_URL = aws_sqs_queue.ci_pending_jobs.id
+ }
+ }
+}
+
+# Logging for Lambda function.
+resource "aws_cloudwatch_log_group" "lambda_ci_push_reactor" {
+ name = "/aws/lambda/${aws_lambda_function.ci_push_reactor.function_name}"
+ retention_in_days = 7
+}
+
+# Queue holding records for ready-to-execute jobs.
+resource "aws_sqs_queue" "ci_pending_jobs" {
+ name = "ci-pending-jobs"
+ delay_seconds = 0
+ message_retention_seconds = 3600
+ # This is effectively the retry interval when received messages fail to
+ # process.
+ visibility_timeout_seconds = 120
+}
+
+resource "aws_iam_role" "lambda_ci_push_reactor" {
+ name = "lambda-ci-push-reactor"
+ description = "For Lambda function that schedules CI jobs"
+ assume_role_policy = data.aws_iam_policy_document.assume_role_lambda.json
+}
+
+data "aws_iam_policy_document" "lambda_ci_push_reactor" {
+ # Allow Lambda function to write CloudWatch events.
+ statement {
+ effect = "Allow"
+ actions = [
+ "logs:CreateLogGroup",
+ "logs:CreateLogStream",
+ "logs:PutLogEvents",
+ ]
+ resources = [aws_cloudwatch_log_group.lambda_ci_push_reactor.arn]
+ }
+ # Allow reading DynamoDB repo push stream.
+ statement {
+ effect = "Allow"
+ actions = [
+ "dynamodb:DescribeStream",
+ "dynamodb:GetRecords",
+ "dynamodb:GetShardIterator",
+ "dynamodb:ListStreams",
+ ]
+ resources = ["${aws_dynamodb_table.ci_push.arn}/stream/*"]
+ }
+ # Allow querying EC2 state to determine what jobs can be scheduled.
+ statement {
+ effect = "Allow"
+ actions = [
+ "ec2:DescribeImages",
+ ]
+ resources = ["*"]
+ }
+ # Allow querying and recording job state in DynamoDB.
+ statement {
+ effect = "Allow"
+ actions = [
+ "dynamodb:PutItem",
+ "dynamodb:Scan",
+ ]
+ resources = [aws_dynamodb_table.ci_job.arn]
+ }
+ # Allow registering pending job events in SQS.
+ statement {
+ effect = "Allow"
+ actions = [
+ "sqs:SendMessage",
+ ]
+ resources = [aws_sqs_queue.ci_pending_jobs.arn]
+ }
+}
+
+resource "aws_iam_role_policy" "lambda_schedule_ci" {
+ role = aws_iam_role.lambda_ci_push_reactor.name
+ name = aws_iam_role.lambda_ci_push_reactor.name
+ policy = data.aws_iam_policy_document.lambda_ci_push_reactor.json
+}
+
+# Trigger push reactor when a repo push event is seen in DynamoDB.
+resource "aws_lambda_event_source_mapping" "dynamodb_repo_push_trigger_push_reactor" {
+ event_source_arn = aws_dynamodb_table.ci_push.stream_arn
+ function_name = aws_lambda_function.ci_push_reactor.arn
+ starting_position = "LATEST"
+ batch_size = 1
+}
+
+#resource "aws_lambda_permission" "ci_push_reactor_allow_dynamodb" {
+# statement_id = "AllowExecutionFromDynamoDB"
+# action = "lambda:InvokeFunction"
+# function_name = aws_lambda_function.ci_push_reactor.function_name
+# principal = "dynamodb.amazonaws.com"
+# source_arn = aws_dynamodb_table.ci_push.stream_arn
+#}
diff --git a/contrib/ci/terraform/job_result_reactor.tf b/contrib/ci/terraform/job_result_reactor.tf
new file mode 100644
--- /dev/null
+++ b/contrib/ci/terraform/job_result_reactor.tf
@@ -0,0 +1,89 @@
+# Defines resources for the component that reacts to a job result being
+# posted.
+
+resource "aws_cloudwatch_log_group" "lambda_ci_job_result_reactor" {
+ name = "/aws/lambda/${aws_lambda_function.ci_job_result_reactor.function_name}"
+ retention_in_days = 7
+}
+
+resource "aws_iam_role" "lambda_ci_job_result_reactor" {
+ name = "lambda-ci-job-result-reactor"
+ description = "For Lambda function reacting to job results being created"
+ assume_role_policy = data.aws_iam_policy_document.assume_role_lambda.json
+}
+
+# Invoked when an S3 object is written.
+resource "aws_lambda_function" "ci_job_result_reactor" {
+ function_name = "ci-job-result-reactor"
+ description = "Reacts to new job result artifacts being created"
+ filename = data.archive_file.lambda_ci.output_path
+ handler = "ci.handle_job_result_s3_artifact"
+ source_code_hash = data.archive_file.lambda_ci.output_base64sha256
+ runtime = "python3.7"
+ timeout = 60
+ role = aws_iam_role.lambda_ci_job_result_reactor.arn
+ environment {
+ variables = {
+ DYNAMODB_JOB_TABLE = aws_dynamodb_table.ci_job.name
+ DYNAMODB_TEST_RESULT_TABLE = aws_dynamodb_table.ci_test_result.name
+ }
+ }
+}
+
+data "aws_iam_policy_document" "lambda_ci_job_result_reactor" {
+ # Allow Lambda function to write CloudWatch events.
+ statement {
+ effect = "Allow"
+ actions = [
+ "logs:CreateLogGroup",
+ "logs:CreateLogStream",
+ "logs:PutLogEvents",
+ ]
+ resources = [aws_cloudwatch_log_group.lambda_ci_job_result_reactor.arn]
+ }
+ # Allow updating job state in DynamoDB.
+ statement {
+ effect = "Allow"
+ actions = [
+ "dynamodb:BatchWriteItem",
+ "dynamodb:GetItem",
+ "dynamodb:PutItem",
+ "dynamodb:UpdateItem",
+ ]
+ resources = [
+ aws_dynamodb_table.ci_job.arn,
+ aws_dynamodb_table.ci_test_result.arn,
+ ]
+ }
+ # Allow reading S3 keys to retrieve job artifacts.
+ statement {
+ effect = "Allow"
+ actions = [
+ "s3:GetObject",
+ ]
+ resources = ["${aws_s3_bucket.mercurial-ci.arn}/jobs/*"]
+ }
+}
+
+resource "aws_iam_role_policy" "lambda_ci_job_result_reactor" {
+ role = aws_iam_role.lambda_ci_job_result_reactor.name
+ name = aws_iam_role.lambda_ci_job_result_reactor.name
+ policy = data.aws_iam_policy_document.lambda_ci_job_result_reactor.json
+}
+
+resource "aws_s3_bucket_notification" "ci_job_artifact_notify_result_reactor" {
+ bucket = aws_s3_bucket.mercurial-ci.bucket
+ lambda_function {
+ lambda_function_arn = aws_lambda_function.ci_job_result_reactor.arn
+ events = ["s3:ObjectCreated:*"]
+ filter_prefix = "jobs/"
+ }
+}
+
+resource "aws_lambda_permission" "lambda_ci_job_result_reactor_allow_s3" {
+ statement_id = "AllowExecutionFromS3Bucket"
+ action = "lambda:InvokeFunction"
+ function_name = aws_lambda_function.ci_job_result_reactor.arn
+ principal = "s3.amazonaws.com"
+ source_arn = aws_s3_bucket.mercurial-ci.arn
+}
diff --git a/contrib/ci/terraform/job_executor.tf b/contrib/ci/terraform/job_executor.tf
new file mode 100644
--- /dev/null
+++ b/contrib/ci/terraform/job_executor.tf
@@ -0,0 +1,190 @@
+# Defines resources for executing jobs.
+
+resource "aws_iam_role" "lambda_ci_run_pending_job" {
+ name = "lambda-ci-run-pending-job"
+ description = "For Lambda function to launch EC2 instances for pending jobs"
+ assume_role_policy = data.aws_iam_policy_document.assume_role_lambda.json
+}
+
+# Function reacting to a pending job in the queue and trying to start it.
+resource "aws_lambda_function" "ci_run_pending_job" {
+ function_name = "ci-run-pending-job"
+ description = "Reacts to pending job events on SQS queue"
+ filename = data.archive_file.lambda_ci.output_path
+ handler = "ci.handle_pending_job"
+ source_code_hash = data.archive_file.lambda_ci.output_base64sha256
+ runtime = "python3.7"
+ timeout = 60
+ role = aws_iam_role.lambda_ci_run_pending_job.arn
+}
+
+resource "aws_cloudwatch_log_group" "lambda_ci_run_pending_job" {
+ name = "/aws/lambda/${aws_lambda_function.ci_run_pending_job.function_name}"
+ retention_in_days = 7
+}
+
+data "aws_iam_policy_document" "lambda_ci_run_pending_job" {
+ # Allow Lambda function to write CloudWatch events.
+ statement {
+ effect = "Allow"
+ actions = [
+ "logs:CreateLogGroup",
+ "logs:CreateLogStream",
+ "logs:PutLogEvents",
+ ]
+ resources = [aws_cloudwatch_log_group.lambda_ci_run_pending_job.arn]
+ }
+ # Allow querying job state in S3 and putting objects there.
+ statement {
+ effect = "Allow"
+ actions = [
+ "s3:ListBucket",
+ "s3:PutObject",
+ "s3:PutObjectAcl",
+ ]
+ resources = ["${aws_s3_bucket.mercurial-ci.arn}/jobs/*"]
+ }
+ # Allow modifying SQS queue.
+ statement {
+ effect = "Allow"
+ actions = [
+ "sqs:ReceiveMessage",
+ "sqs:DeleteMessage",
+ "sqs:GetQueueAttributes",
+ ]
+ resources = [aws_sqs_queue.ci_pending_jobs.arn]
+ }
+ # Allow querying EC2 state and launching instances to run jobs.
+ statement {
+ effect = "Allow"
+ actions = [
+ "ec2:*",
+ "iam:*",
+ ]
+ resources = ["*"]
+ }
+}
+
+resource "aws_iam_role_policy" "lambda_ci_handle_pending_job" {
+ role = aws_iam_role.lambda_ci_run_pending_job.name
+ name = aws_iam_role.lambda_ci_run_pending_job.name
+ policy = data.aws_iam_policy_document.lambda_ci_run_pending_job.json
+}
+
+# Hook up SQS to Lambda function.
+resource "aws_lambda_event_source_mapping" "ci_run_pending_job_from_sqs" {
+ event_source_arn = aws_sqs_queue.ci_pending_jobs.arn
+ function_name = aws_lambda_function.ci_run_pending_job.arn
+ batch_size = 1
+}
+
+# We have another Lambda function for reacting to state changes in worker
+# instances.
+
+resource "aws_cloudwatch_log_group" "lambda_ci_instance_state_change" {
+ name = "/aws/lambda/${aws_lambda_function.ci_instance_state_change.function_name}"
+ retention_in_days = 7
+}
+
+resource "aws_iam_role" "lambda_ci_instance_state_change" {
+ name = "lambda-ci-instance-state-change"
+ description = "For Lambda function reacting to instance state changes"
+ assume_role_policy = data.aws_iam_policy_document.assume_role_lambda.json
+}
+
+resource "aws_lambda_function" "ci_instance_state_change" {
+ function_name = "ci-instance-state-change"
+ description = "Reacts to EC2 instances changing state"
+ filename = data.archive_file.lambda_ci.output_path
+ handler = "ci.handle_instance_state_change"
+ source_code_hash = data.archive_file.lambda_ci.output_base64sha256
+ runtime = "python3.7"
+ timeout = 120
+ role = aws_iam_role.lambda_ci_instance_state_change.arn
+ environment {
+ variables = {
+ DYNAMODB_JOB_TABLE = aws_dynamodb_table.ci_job.name
+ }
+ }
+}
+
+data "aws_iam_policy_document" "lambda_ci_instance_state_change" {
+ # Allow Lambda function to write CloudWatch events.
+ statement {
+ effect = "Allow"
+ actions = [
+ "logs:CreateLogGroup",
+ "logs:CreateLogStream",
+ "logs:PutLogEvents",
+ ]
+ resources = [aws_cloudwatch_log_group.lambda_ci_instance_state_change.arn]
+ }
+ # Allow querying EC2 instance state.
+ statement {
+ effect = "Allow"
+ actions = [
+ "ec2:CreateTags",
+ "ec2:DescribeInstanceAttribute",
+ "ec2:DescribeInstances",
+ ]
+ resources = ["*"]
+ }
+ # Allow updating job state in DynamoDB.
+ statement {
+ effect = "Allow"
+ actions = [
+ "dynamodb:BatchWriteItem",
+ "dynamodb:PutItem",
+ "dynamodb:UpdateItem",
+ ]
+ resources = [
+ aws_dynamodb_table.ci_job.arn,
+ aws_dynamodb_table.ci_test_result.arn,
+ ]
+ }
+ # Allow reading S3 keys to retrieve job state.
+ statement {
+ effect = "Allow"
+ actions = [
+ "s3:GetObject",
+ ]
+ resources = ["${aws_s3_bucket.mercurial-ci.arn}/jobs/*"]
+ }
+}
+
+resource "aws_iam_role_policy" "lambda_ci_instance_state_change" {
+ role = aws_iam_role.lambda_ci_instance_state_change.name
+ name = aws_iam_role.lambda_ci_instance_state_change.name
+ policy = data.aws_iam_policy_document.lambda_ci_instance_state_change.json
+}
+
+# CloudWatch Event Rule that fires whenever an instance state changes.
+resource "aws_cloudwatch_event_rule" "trigger_instance_state_change" {
+ name = "trigger-instance-state-change"
+ description = "Signals when an EC2 instance state is changing"
+ event_pattern = <<PATTERN
+{
+ "source": [
+ "aws.ec2"
+ ],
+ "detail-type": [
+ "EC2 Instance State-change Notification"
+ ]
+}
+PATTERN
+}
+
+# Have the instance change event trigger our instance change Lambda function.
+resource "aws_cloudwatch_event_target" "instance_state_change" {
+ rule = aws_cloudwatch_event_rule.trigger_instance_state_change.name
+ arn = aws_lambda_function.ci_instance_state_change.arn
+}
+
+# And allow that CloudWatch-initiated trigger of Lambda to work.
+resource "aws_lambda_permission" "ci_instance_state_change_allow_cloudwatch" {
+ statement_id = "AllowExecutionFromCloudWatch"
+ action = "lambda:InvokeFunction"
+ function_name = aws_lambda_function.ci_instance_state_change.function_name
+ principal = "events.amazonaws.com"
+ source_arn = aws_cloudwatch_event_rule.trigger_instance_state_change.arn
+}
diff --git a/contrib/ci/terraform/init.tf b/contrib/ci/terraform/init.tf
new file mode 100644
--- /dev/null
+++ b/contrib/ci/terraform/init.tf
@@ -0,0 +1,42 @@
+provider "archive" {}
+
+terraform {
+ backend "s3" {
+ bucket = "mercurial-ci-private"
+ key = "terraform/hg.tfstate"
+ region = "us-west-2"
+ }
+}
+
+variable "account_id" {
+ # gregoryszorc-hg
+ default = "585867089697"
+}
+
+variable "parent_account_id" {
+ # gregoryszorc
+ default = "381522727988"
+}
+
+variable "parent_account_manager_role_arn" {
+ default = "arn:aws:iam::381522727988:role/lambda-hg-account-manage"
+}
+
+variable "ci_hostname" {
+ # Route53 defined in parent account.
+ default = "ci.hg.gregoryszorc.com"
+}
+
+resource "aws_iam_account_alias" "alias" {
+ account_alias = "gregoryszorc-hg"
+}
+
+provider "aws" {
+ region = "us-west-2"
+}
+
+data "archive_file" "lambda_ci" {
+ type = "zip"
+ output_path = "${path.root}/../../../build/lambda_ci.zip"
+ source_dir = "${path.root}/../lambda_functions"
+}
diff --git a/contrib/ci/terraform/iam.tf b/contrib/ci/terraform/iam.tf
new file mode 100644
--- /dev/null
+++ b/contrib/ci/terraform/iam.tf
@@ -0,0 +1,52 @@
+# Generic policy to allow an EC2 service to assume a role.
+data "aws_iam_policy_document" "assume_role_ec2" {
+ statement {
+ effect = "Allow"
+ principals {
+ type = "Service"
+ identifiers = ["ec2.amazonaws.com"]
+ }
+ actions = ["sts:AssumeRole"]
+ }
+}
+
+# Generic policy to allow a Lambda function to assume a role.
+data "aws_iam_policy_document" "assume_role_lambda" {
+ statement {
+ effect = "Allow"
+ principals {
+ type = "Service"
+ identifiers = ["lambda.amazonaws.com"]
+ }
+ actions = ["sts:AssumeRole"]
+ }
+}
+
+resource "aws_iam_role" "admin" {
+ name = "admin"
+ description = "Full administrator access"
+ assume_role_policy = data.aws_iam_policy_document.assume_role_admin_from_parent.json
+}
+
+resource "aws_iam_role_policy_attachment" "admin-administrator" {
+ role = aws_iam_role.admin.name
+ policy_arn = "arn:aws:iam::aws:policy/AdministratorAccess"
+}
+
+resource "aws_iam_group" "admins" {
+ name = "admins"
+}
+
+resource "aws_iam_group_policy_attachment" "admins-administrator" {
+ group = aws_iam_group.admins.name
+ policy_arn = "arn:aws:iam::aws:policy/AdministratorAccess"
+}
+
+resource "aws_iam_user" "hg" {
+ name = "hg"
+}
+
+resource "aws_iam_user_group_membership" "hg-group-admins" {
+ user = aws_iam_user.hg.name
+ groups = [aws_iam_group.admins.name]
+}
diff --git a/contrib/ci/terraform/cloudwatch.tf b/contrib/ci/terraform/cloudwatch.tf
new file mode 100644
--- /dev/null
+++ b/contrib/ci/terraform/cloudwatch.tf
@@ -0,0 +1,4 @@
+resource "aws_cloudwatch_log_group" "ssm_run_power_shell_script" {
+ name = "/aws/ssm/AWS-RunPowerShellScript"
+ retention_in_days = 7
+}
diff --git a/contrib/ci/terraform/account_manager.tf b/contrib/ci/terraform/account_manager.tf
new file mode 100644
--- /dev/null
+++ b/contrib/ci/terraform/account_manager.tf
@@ -0,0 +1,76 @@
+# This file defines resources that allow an external / parent account to
+# manage this account.
+
+# Assume role policy which can be used by the root user in the
+# parent account.
+data "aws_iam_policy_document" "assume_role_admin_from_parent" {
+ statement {
+ effect = "Allow"
+ principals {
+ type = "AWS"
+ identifiers = ["arn:aws:iam::${var.parent_account_id}:root"]
+ }
+ actions = ["sts:AssumeRole"]
+ }
+}
+
+# Allows the purging lambda function from the parent account to assume roles.
+data "aws_iam_policy_document" "assume_role_account_manager" {
+ statement {
+ effect = "Allow"
+ principals {
+ type = "AWS"
+ identifiers = [var.parent_account_manager_role_arn]
+ }
+ actions = ["sts:AssumeRole"]
+ }
+}
+
+# Allows the account manager to do anything.
+data "aws_iam_policy_document" "account_manager" {
+ statement {
+ effect = "Allow"
+ actions = ["*"]
+ resources = ["*"]
+ }
+}
+
+resource "aws_iam_role" "account_manager" {
+ name = "account-manager"
+ description = "Assumed to audit and clean up this account"
+ assume_role_policy = data.aws_iam_policy_document.assume_role_account_manager.json
+}
+
+output "iam_role_account_manager_arn" {
+ value = aws_iam_role.account_manager.arn
+}
+
+resource "aws_iam_role_policy" "account_manager" {
+ role = aws_iam_role.account_manager.name
+ name = aws_iam_role.account_manager.name
+ policy = data.aws_iam_policy_document.account_manager.json
+}
+
+# Allow parent account to reach into our Terraform state.
+data "aws_iam_policy_document" "parent_account_terraform_access" {
+ statement {
+ effect = "Allow"
+ actions = [
+ "s3:ListBucket",
+ "s3:GetObject",
+ ]
+ principals {
+ type = "AWS"
+ identifiers = ["arn:aws:iam::${var.parent_account_id}:user/gps"]
+ }
+ resources = [
+ aws_s3_bucket.private.arn,
+ "${aws_s3_bucket.private.arn}/terraform/*",
+ ]
+ }
+}
+
+resource "aws_s3_bucket_policy" "parent_account_terraform_access" {
+ bucket = aws_s3_bucket.private.bucket
+ policy = data.aws_iam_policy_document.parent_account_terraform_access.json
+}
diff --git a/contrib/ci/lambda_functions/web.py b/contrib/ci/lambda_functions/web.py
new file mode 100644
--- /dev/null
+++ b/contrib/ci/lambda_functions/web.py
@@ -0,0 +1,390 @@
+# web.py - Web component of Mercurial CI
+#
+# Copyright 2019 Gregory Szorc <gregory.szorc at gmail.com>
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2 or any later version.
+
+# no-check-code because Python 3 native.
+
+import datetime
+import html
+import os
+
+import boto3
+from boto3.dynamodb.conditions import (
+ Key,
+)
+
# Shorthand for escaping untrusted text interpolated into HTML.
e = html.escape


# Security-related headers attached to every HTML response.
HTML_HEADERS = {
    # CSP directive names take no colon; the original "default-src:" (with
    # a colon) is invalid syntax and browsers would discard the directive.
    'Content-Security-Policy': "default-src https:; img-src 'self'; script-src 'self'; style-src 'self' 'unsafe-inline'; frame-ancestors 'none'",
    'Content-Type': 'text/html; charset=utf-8',
    'Strict-Transport-Security': 'max-age=63072000',
    'X-Content-Type-Options': 'nosniff',
    'X-Frame-Options': 'DENY',
    'X-XSS-Protection': '1; mode=block',
}
+
+
def handler(event, context):
    """Lambda entry point for the CI web UI (API Gateway proxy integration).

    Routes on the request path: ``/`` renders the dashboard and
    ``/job-info/<job_id>`` renders details for a single job. Anything else
    is a 404. Returns an API Gateway proxy response dict.
    """
    path = event['path']
    print('received request for %s' % path)

    dynamodb = boto3.resource('dynamodb')

    # Table names are injected via environment variables (set by Terraform).
    repo_poll_table = dynamodb.Table(os.environ['DYNAMODB_REPO_POLL_TABLE'])
    push_table = dynamodb.Table(os.environ['DYNAMODB_PUSH_TABLE'])
    job_table = dynamodb.Table(os.environ['DYNAMODB_JOB_TABLE'])
    test_result_table = dynamodb.Table(os.environ['DYNAMODB_TEST_RESULT_TABLE'])

    if path == '/':
        return render_main(repo_poll_table, push_table, job_table)
    elif path.startswith('/job-info/'):
        job_id = path[len('/job-info/'):]
        return render_job_info(job_table, test_result_table, job_id)
    else:
        return {
            'statusCode': 404,
            'headers': HTML_HEADERS,
            'body': '<html>not found</html>',
        }
+
+
def render_main(repo_poll_table, push_table, job_table):
    """Render the dashboard: recent pushes per repository and their jobs.

    For every monitored repository, show the 10 most recent pushes and,
    for each, a table summarizing every CI job run against that push.

    Returns an API Gateway proxy response dict.
    """
    def count_cell(info, key):
        # Format an optional numeric attribute, falling back to 'n/a'.
        return '%d' % info[key] if key in info else 'n/a'

    # Renamed from `html` to avoid shadowing the imported html module.
    out = [
        '<html>',
        '<body>',
        '<style type="text/css">',
        'th { padding-left: 10px; }',
        'td { padding-left: 20px; text-align: right; }',
        '.mono { font-family: monospace; }',
        '</style>',
    ]

    for repo_entry in repo_poll_table.scan(Select='ALL_ATTRIBUTES')['Items']:
        repo_name = repo_entry['repo']
        repo_url = repo_entry['repo_url']

        out.append('<h1><a href="%s">%s</a></h1>' % (
            e(repo_url, quote=True), e(repo_name)))

        # 10 most recent pushes, newest first (push_id range key sorts by
        # push date).
        res = push_table.query(
            KeyConditionExpression=Key('repo').eq(repo_name),
            Select='ALL_ATTRIBUTES',
            Limit=10,
            ScanIndexForward=False,
        )
        for push in res['Items']:
            out.append(push_info(push, repo_url))

            # Now find all jobs for this push, grouped by job name then
            # build number.
            cset_results = {}

            for entry in get_jobs_metdata(job_table, repo_name, push['node']):
                job_name = entry['job_name']
                build_number = entry['build_number']

                cset_results.setdefault(job_name, {})[build_number] = entry

            out.extend([
                '<table>',
                '<tr>',
                '<th>Job Name</th>',
                '<th>Run</th>',
                '<th>Job State</th>',
                '<th>Scheduled At</th>',
                '<th>Start Delay</th>',
                '<th>Execution Time</th>',
                '<th>Total Tests</th>',
                '<th>Passed</th>',
                '<th>Failed</th>',
                '<th>Skipped</th>',
                '<th>Artifacts</th>',
                '</tr>',
            ])

            for job_name, job_results in sorted(cset_results.items()):
                for build_number, job_info in sorted(job_results.items()):
                    if 'output_log_url' in job_info:
                        output = '<a href="%s">output log</a>' % e(job_info['output_log_url'])
                    else:
                        output = ''

                    schedule_time = datetime.datetime.utcfromtimestamp(
                        job_info['schedule_time'])

                    if 'start_time' in job_info:
                        start_time = datetime.datetime.utcfromtimestamp(
                            job_info['start_time'])
                        start_delay = '%ds' % (start_time - schedule_time).total_seconds()
                    else:
                        start_time = None
                        start_delay = 'n/a'

                    # Guard on start_time too: a record can carry end_time
                    # without start_time, and the old code would raise a
                    # NameError (or silently reuse a stale start_time from
                    # a previous row) in that case.
                    if 'end_time' in job_info and start_time is not None:
                        end_time = datetime.datetime.utcfromtimestamp(
                            job_info['end_time'])
                        execution_time = '%ds' % (end_time - start_time).total_seconds()
                    else:
                        execution_time = 'n/a'

                    test_count = count_cell(job_info, 'test_count')
                    pass_count = count_cell(job_info, 'pass_count')
                    fail_count = count_cell(job_info, 'fail_count')
                    skip_count = count_cell(job_info, 'skip_count')

                    if job_info['execution_state'] in ('pending', 'running'):
                        job_state = job_info['execution_state']
                    elif job_info['execution_state'] == 'done':
                        exit_clean = job_info.get('exit_clean')
                        if exit_clean is None:
                            job_state = 'unknown'
                        elif exit_clean is True:
                            job_state = 'completed'
                        elif exit_clean is False:
                            job_state = 'aborted'
                        else:
                            raise Exception('unhandled exit_clean: %s' % exit_clean)
                    else:
                        raise Exception('unhandled execution_state: %s'
                                        % job_info['execution_state'])

                    if execution_time != 'n/a':
                        execution_entry = '<a href="job-info/%s#timeline">%s</a>' % (
                            e(job_info['job_id'], quote=True), e(execution_time))
                    else:
                        execution_entry = e(execution_time)

                    if fail_count not in ('n/a', '0'):
                        fail_entry = '<a href="job-info/%s#failed-tests">%s</a>' % (
                            e(job_info['job_id'], quote=True), e(fail_count))
                    else:
                        fail_entry = e(fail_count)

                    if skip_count not in ('n/a', '0'):
                        skip_entry = '<a href="job-info/%s#skipped-tests">%s</a>' % (
                            e(job_info['job_id'], quote=True), e(skip_count))
                    else:
                        skip_entry = e(skip_count)

                    out.extend([
                        '<tr>',
                        '<td>%s</td>' % e(job_name),
                        '<td><a href="job-info/%s">%d</a></td>' % (
                            e(job_info['job_id'], quote=True), build_number),
                        '<td>%s</td>' % e(job_state),
                        '<td>%s</td>' % schedule_time.isoformat(),
                        '<td>%s</td>' % start_delay,
                        '<td>%s</td>' % execution_entry,
                        '<td>%s</td>' % test_count,
                        '<td>%s</td>' % pass_count,
                        '<td>%s</td>' % fail_entry,
                        '<td>%s</td>' % skip_entry,
                        '<td>%s</td>' % output,
                        '</tr>'
                    ])

            out.append('</table>')

    out.extend([
        '</body>',
        '</html>',
    ])

    return {
        'statusCode': 200,
        'headers': HTML_HEADERS,
        'body': ''.join(out),
    }
+
+
def render_job_info(job_table, test_result_table, job_id):
    """Render the details page for a single job.

    Shows job metadata, failed-test diffs, skipped tests, and an SVG
    timeline of per-test execution. Returns an API Gateway proxy
    response dict (404 when the job ID is unknown).
    """
    out = [
        '<html>',
        '<head><title>Job %s</title></head>' % e(job_id),
        '<h1>Job %s</h1>' % e(job_id),
    ]

    res = job_table.get_item(Key={'job_id': job_id})
    if 'Item' not in res:
        return {
            'statusCode': 404,
            'headers': HTML_HEADERS,
            'body': '<html>job not found</html>',
        }

    job = res['Item']

    schedule_time = datetime.datetime.utcfromtimestamp(job['schedule_time'])
    if 'start_time' in job:
        start_time = datetime.datetime.utcfromtimestamp(job['start_time']).isoformat()
    else:
        start_time = 'n/a'
    if 'end_time' in job:
        end_time = datetime.datetime.utcfromtimestamp(job['end_time']).isoformat()
    else:
        end_time = 'n/a'

    out.extend([
        '<table>',
        '<tr><td>Repo:</td><td>%s</td>' % e(job['repo']),
        '<tr><td>Node:</td><td>%s</td>' % e(job['node']),
        '<tr><td>Name:</td><td>%s</td>' % e(job['job_name']),
        '<tr><td>Scheduled At:</td><td>%s</td>' % e(schedule_time.isoformat()),
        '<tr><td>Started At:</td><td>%s</td>' % e(start_time),
        '<tr><td>Finished At:</td><td>%s</td>' % e(end_time),
        '</table>',
    ])

    test_results = list(get_test_results(test_result_table, job_id))

    if job.get('fail_count') not in (None, 0):
        out.append('<h1 id="failed-tests">Failed Tests</h1>')

        failed_tests = [t for t in test_results if t['result'] == 'failure']

        # Index of links first, then the actual diffs.
        out.append('<ul>')

        for result in failed_tests:
            out.append('<li><a href="#failure-%s">%s</a></li>' % (
                e(result['test_name'], quote=True), e(result['test_name'])))

        out.append('</ul>')

        for result in failed_tests:
            out.extend([
                '<h2 id="failure-%s">%s</h2>' % (
                    e(result['test_name'], quote=True), e(result['test_name'])),
                '<pre>%s</pre>' % e(result['diff'] or ''),
            ])

    if job.get('skip_count') not in (None, 0):
        out.append('<h1 id="skipped-tests">Skipped Tests</h1>')
        out.append('<ul>')

        for result in test_results:
            if result['result'] != 'skip':
                continue

            out.append('<li>%s</li>' % e(result['test_name']))

        out.append('</ul>')

    out.append('<h1 id="timeline">Timeline</h1>')

    if test_results:
        # One 20px row per test; width sized to the latest end time plus
        # room for the label text.
        out.append('<svg height="%d" width="%d">' % (
            len(test_results) * 20,
            max(t['end'] for t in test_results) + 400,
        ))

        y_offset = 0

        for result in sorted(test_results, key=lambda x: x['start']):
            duration = result['end'] - result['start']

            out.extend([
                '<g transform="translate(%d, %d)">' % (
                    int(result['start']), y_offset),
                # Need to add 1 otherwise 0 won't render.
                '<rect width="%d" height="19"></rect>' % (int(duration) + 1),
                # Fixed: a stray comma between the x and y attributes made
                # this invalid SVG markup.
                '<text x="%d" y="9.5" dy="0.35em">%s (%.2fs)</text>' % (
                    int(duration) + 6, e(result['test_name']),
                    result['end'] - result['start']),
                '</g>',
            ])

            y_offset += 20

        out.append('</svg>')
    else:
        out.append('<p>No test results</p>')

    out.append('</html>')

    return {
        'statusCode': 200,
        'headers': HTML_HEADERS,
        'body': ''.join(out)
    }
+
+
def get_jobs_metdata(job_table, repo, node):
    """Obtain jobs records for a revision.

    Generator that scans the entire job table (following DynamoDB
    pagination) and yields records matching (repo, node).

    NOTE(review): the name's "metdata" typo is kept because callers
    reference it; renaming requires updating the call sites too.
    """
    exclusive_start_key = None

    while True:
        # Passing ExclusiveStartKey=None doesn't work :(
        extra = {}
        if exclusive_start_key:
            extra['ExclusiveStartKey'] = exclusive_start_key

        res = job_table.scan(
            Select='ALL_ATTRIBUTES',
            FilterExpression='repo = :repo AND node = :node',
            ExpressionAttributeValues={
                ':repo': repo,
                ':node': node,
            },
            **extra
        )
        for entry in res['Items']:
            yield entry

        # No LastEvaluatedKey means the scan reached the end of the table.
        if 'LastEvaluatedKey' not in res:
            return

        exclusive_start_key = res['LastEvaluatedKey']
+
+
def get_test_results(test_result_table, job_id):
    """Generator yielding all per-test result records for a job.

    Queries on the job_id partition key and follows DynamoDB pagination.
    """
    exclusive_start_key = None

    while True:
        # Passing ExclusiveStartKey=None is rejected; only add the kwarg
        # when we actually have a continuation key.
        extra = {}
        if exclusive_start_key:
            extra['ExclusiveStartKey'] = exclusive_start_key

        res = test_result_table.query(
            KeyConditionExpression=Key('job_id').eq(job_id),
            Select='ALL_ATTRIBUTES',
            **extra
        )

        for item in res['Items']:
            yield item

        # No LastEvaluatedKey means the query is complete.
        if not res.get('LastEvaluatedKey'):
            return

        exclusive_start_key = res['LastEvaluatedKey']
+
+
def push_info(push, repo_url):
    """Render the heading block (changeset link plus metadata) for a push."""
    changeset_link = '%s/rev/%s' % (repo_url, push['node'])
    summary_line = push['message'].splitlines()[0]

    parts = [
        '<h2>Changeset <span class="mono"><a href="%s">%s</a></span></h2>' % (
            e(changeset_link, quote=True), e(push['node'])),
        '<p>branch: <span class="mono">%s</span></p>' % e(push['branch']),
        '<p>author: <span class="mono">%s</span></p>' % e(push['user']),
        '<p>description: <span class="mono">%s</span></p>' % e(summary_line),
    ]

    return ''.join(parts)
diff --git a/contrib/ci/lambda_functions/ci.py b/contrib/ci/lambda_functions/ci.py
new file mode 100644
--- /dev/null
+++ b/contrib/ci/lambda_functions/ci.py
@@ -0,0 +1,567 @@
+# ci.py - Lambda functions for Mercurial CI
+#
+# Copyright 2019 Gregory Szorc <gregory.szorc at gmail.com>
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2 or any later version.
+# no-check-code because Python 3 native.
+
+import base64
+import datetime
+import decimal
+import json
+import os
+import time
+import urllib.request
+
+import boto3
+
+
def handle_poll_repo(event, context):
    """Handler for polling a repository and recording pushes.

    The repository URL and the space-separated revision symbols to poll
    come from environment variables.
    """
    dynamodb = boto3.resource('dynamodb')
    poll_table = dynamodb.Table(os.environ['DYNAMODB_REPO_POLL_TABLE'])
    push_table = dynamodb.Table(os.environ['DYNAMODB_PUSH_TABLE'])

    repo_url = os.environ['REPO_URL'].rstrip('/')
    poll_revs = os.environ['POLL_REVS'].split()

    poll_repo(poll_table, push_table, repo_url, poll_revs)
+
+
def handle_schedule_from_push(event, context):
    """Handler for scheduling CI for a repo node via DynamoDB changes.

    Triggered by the push table's stream; only INSERT events (new pushes)
    result in CI being scheduled.
    """
    for record in event['Records']:
        keys = record['dynamodb']['Keys']
        print('received %s event for %s %s' % (
            record['eventName'], keys['repo']['S'], keys['push_id']['S']))

        if record['eventName'] != 'INSERT':
            continue

        # Rebind to the new image of the row. Values are raw DynamoDB
        # attribute maps, hence the ['S'] (string) accessors.
        record = record['dynamodb']['NewImage']
        schedule_ci(record['repo_url']['S'], record['repo']['S'], record['node']['S'])
+
+
def handle_pending_job(event, context):
    """Handler for starting a job from an SQS message.

    Each message body is JSON carrying a user data template, the values to
    interpolate into it, and the EC2 launch configuration for the instance
    that will run the job.
    """
    ec2 = boto3.client('ec2')

    for record in event['Records']:
        body = record['body']

        data = json.loads(body)
        user_data_template = data['user_data_template']
        user_data_params = data['user_data_params']
        ec2_instance_config = data['ec2_instance_launch_config']

        start_pending_job(ec2, user_data_template, user_data_params, ec2_instance_config)
+
+
def handle_job_result_s3_artifact(event, context):
    """Handler called when a new S3 object job artifact is uploaded."""
    dynamodb = boto3.resource('dynamodb')
    s3 = boto3.resource('s3')

    # Table names are injected via environment variables.
    job_table = dynamodb.Table(os.environ['DYNAMODB_JOB_TABLE'])
    test_result_table = dynamodb.Table(os.environ['DYNAMODB_TEST_RESULT_TABLE'])

    for record in event['Records']:
        # We assume the key was uploaded to the proper location. This may
        # not be safe. But to resolve the principal initiating the change or its
        # IP address to an EC2 instance might be too expensive.
        key = s3.Object(record['s3']['bucket']['name'], record['s3']['object']['key'])

        process_job_artifact(job_table, test_result_table, key)
+
+
def handle_instance_state_change(event, context):
    """Handler for EC2 instance state change events.

    Filters to instances using the ci-worker IAM instance profile and
    delegates job bookkeeping to react_to_instance_state_change().
    """
    instance_id = event['detail']['instance-id']
    state = event['detail']['state']
    print('received %s for %s' % (state, instance_id))

    ec2 = boto3.resource('ec2')
    dynamodb = boto3.resource('dynamodb')

    instance = ec2.Instance(instance_id)

    # We only care about events for ci-worker instances.
    if not instance.iam_instance_profile:
        print('no IAM instance profile defined; ignoring')
        return

    if not instance.iam_instance_profile['Arn'].endswith('/ci-worker'):
        print('not a CI worker; ignoring')
        return

    job_table = dynamodb.Table(os.environ['DYNAMODB_JOB_TABLE'])

    react_to_instance_state_change(job_table, instance, state)
+
+
def next_build_number(job_table, repo, node, job_name):
    """Find the next available build number for a job given its unique name.

    Scans existing job records for (repo, node, job_name) and returns one
    more than the highest build number seen, or 0 when none exist.
    """
    build_number = 0

    # A DynamoDB scan returns at most one page of data per call. Follow
    # LastEvaluatedKey so existing builds beyond the first page are not
    # missed -- otherwise we could hand out duplicate build numbers (and
    # therefore duplicate job IDs).
    extra = {}

    while True:
        res = job_table.scan(
            ProjectionExpression='repo, node, job_name, build_number',
            FilterExpression='repo = :repo AND node = :node AND job_name = :name',
            ExpressionAttributeValues={
                ':repo': repo,
                ':node': node,
                ':name': job_name,
            },
            **extra
        )

        for entry in res['Items']:
            if entry['build_number'] >= build_number:
                build_number = int(entry['build_number']) + 1

        if 'LastEvaluatedKey' not in res:
            return build_number

        extra['ExclusiveStartKey'] = res['LastEvaluatedKey']
+
+
def poll_repo(poll_table, push_table, repo_url, poll_revs):
    """Poll a repository for new changes and record them.

    For each revision symbol in ``poll_revs``, resolve the current node via
    the repo's hgweb JSON API. Any symbol whose node differs from the last
    recorded poll state is written to the push table as a new push, then
    the poll state record is replaced.
    """
    repo_name = repo_url.split('/')[-1]

    print('polling %s at %s' % (repo_name, repo_url))

    new_state = {
        'last_poll': datetime.datetime.utcnow().isoformat(),
        'repo': repo_name,
        'repo_url': repo_url,
        'revs': {},
    }

    node_info = {}

    for rev in poll_revs:
        # hgweb exposes changeset metadata as JSON at /json-rev/<rev>.
        url = '%s/json-rev/%s' % (repo_url, rev)

        req = urllib.request.Request(url, headers={'User-Agent': "Greg's Repo Poller"})
        res = urllib.request.urlopen(req, timeout=10)
        cset = json.load(res)

        print('%s resolves to %s' % (rev, cset['node']))
        new_state['revs'][rev] = cset['node']

        node_info[cset['node']] = {
            'branch': cset['branch'],
            'user': cset['user'],
            'message': cset['desc'],
        }

    # Load the state from the previous poll so we can diff against it.
    res = poll_table.get_item(Key={'repo': repo_name})
    if 'Item' in res:
        last_state = res['Item']
    else:
        last_state = {
            'revs': {},
        }

    for rev, new_node in sorted(new_state['revs'].items()):
        old_node = last_state['revs'].get(rev)

        if new_node == old_node:
            continue

        info = node_info[new_node]
        print('revision %s updated; old=%s; new=%s' % (rev, old_node, new_node))

        # Insert the push record into DynamoDB.
        print('recording push in DynamoDB')
        push_table.put_item(Item={
            # Partition key.
            'repo': repo_name,
            # Range key. Sort by date. Break ties by poll revision.
            'push_id': '%s-%s' % (new_state['last_poll'], rev),
            'repo_url': repo_url,
            'repo_name': repo_name,
            'poll_rev': rev,
            'push_date': new_state['last_poll'],
            'node': new_node,
            'branch': info['branch'],
            'user': info['user'],
            'message': info['message'],
        })

    print('updating poll state')
    poll_table.put_item(Item=new_state)
+
+
def schedule_ci(repo_url, repo, node):
    """Schedule CI jobs for a changeset across all known worker AMIs."""
    print('scheduling CI for revision %s on %s' % (node, repo_url))
    dynamodb = boto3.resource('dynamodb')
    ec2 = boto3.resource('ec2')
    s3 = boto3.resource('s3')
    sqs = boto3.client('sqs')

    job_table = dynamodb.Table(os.environ['DYNAMODB_JOB_TABLE'])
    bucket = s3.Bucket(os.environ['S3_BUCKET'])
    sqs_url = os.environ['SQS_URL']

    # Map each self-owned worker AMI to the OS prefix used in job names.
    for image in ec2.images.filter(Owners=['self']):
        if image.name == 'hg-linux-dev-debian9':
            schedule_linux_ci(job_table, sqs, sqs_url, bucket, repo_url, repo, node, image, 'debian9')
        elif image.name == 'hg-linux-dev-ubuntu18.04':
            schedule_linux_ci(job_table, sqs, sqs_url, bucket, repo_url, repo, node, image, 'ubuntu18.04')
        elif image.name == 'hg-linux-dev-ubuntu18.10':
            schedule_linux_ci(job_table, sqs, sqs_url, bucket, repo_url, repo, node, image, 'ubuntu18.10')
        elif image.name == 'hg-linux-dev-ubuntu19.04':
            schedule_linux_ci(job_table, sqs, sqs_url, bucket, repo_url, repo, node, image, 'ubuntu19.04')
+
+
# Shell script executed on the worker to run the test harness.
# Positional args (see the invocation in LINUX_USER_DATA's runcmd):
#   $1 = repo URL, $2 = node, $3 = python interpreter,
#   $4 = s3://bucket/prefix destination for artifacts.
RUN_TESTS_LINUX = '''
#!/bin/bash

HG=/hgdev/venv-bootstrap/bin/hg

cd /hgwork/src

${HG} pull -r $2 $1
${HG} log -r $2
${HG} up $2

export TMPDIR=/hgwork/tmp
cd tests
time $3 ./run-tests.py --json 2>&1 | tee output.log

aws s3 cp --content-type text/plain --acl public-read output.log $4/output.log
# The JSON file has a prefix to allow loading in web browsers.
tail -c +13 report.json > report-truncated.json
aws s3 cp --content-type application/json --acl public-read report-truncated.json $4/report.json
'''.lstrip()
+
+
# cloud-init user data template for Linux workers. The `# TAG key value`
# lines are parsed by the instance state change Lambda and converted into
# EC2 tags, because spot instance requests cannot carry tags at launch.
LINUX_USER_DATA = '''
#cloud-config

# TAG Name ci-worker
# TAG job_id {job_id}
# TAG repo_url {repo_url}
# TAG repo {repo}
# TAG node {node}
# TAG job_name {job_name}
# TAG build_number {build_number}
# TAG s3_bucket {s3_bucket}
# TAG s3_prefix {s3_prefix}

repo_update: false
repo_upgrade: false

write_files:
  - path: /run-tests-linux
    owner: hg:hg
    permissions: '0755'
    encoding: b64
    content: {run_tests_linux_b64}

runcmd:
  - mkdir /hgwork
  - mkdir /hgwork/tmp
  - chown -R hg:hg /hgwork
  - sudo -u hg -g hg rsync -a /hgdev/src /hgwork/
  - sudo -u hg -g hg /run-tests-linux {repo_url} {node} {python} s3://{s3_bucket}/{s3_prefix} 2>&1 | tee /ci.log
  - aws s3 cp --content-type text/plain --acl public-read /ci.log s3://{s3_bucket}/{s3_prefix}/ci.log
  - echo done > done
  - aws s3 cp --content-type text/plain --acl public-read done s3://{s3_bucket}/{s3_prefix}/done

power_state:
  delay: now
  mode: poweroff

'''.lstrip()
+
+
def schedule_linux_ci(job_table, sqs, sqs_url, bucket, repo_url, repo, node, image, os_prefix):
    """Schedule the matrix of Linux CI jobs for a changeset on one AMI.

    For each (job name, python interpreter) pair: allocate a build number,
    record a pending job in DynamoDB, and queue an SQS message describing
    how to launch the EC2 instance that will run the job.
    """
    block_device_mappings = [
        {
            'DeviceName': image.block_device_mappings[0]['DeviceName'],
            'Ebs': {
                'DeleteOnTermination': True,
                'VolumeSize': 12,
                'VolumeType': 'gp2',
            },
        }
    ]

    # The test-runner script is shipped to the instance base64-encoded
    # inside the cloud-init user data.
    run_tests_linux_b64 = base64.b64encode(RUN_TESTS_LINUX.encode('utf-8')).decode('ascii')

    # (job name suffix, python interpreter used to drive run-tests.py)
    jobs = (
        ('system-python2', '/usr/bin/python2'),
        ('system-python3', '/usr/bin/python3'),
        ('cpython-2.7', '/hgdev/pyenv/shims/python2.7'),
        ('cpython-3.5', '/hgdev/pyenv/shims/python3.5'),
        ('cpython-3.6', '/hgdev/pyenv/shims/python3.6'),
        ('cpython-3.7', '/hgdev/pyenv/shims/python3.7'),
        ('cpython-3.8', '/hgdev/pyenv/shims/python3.8'),
    )

    for job_name, python in jobs:
        job_name = '%s-%s' % (os_prefix, job_name)
        build_number = next_build_number(job_table, repo, node, job_name)
        job_id = '%s-%s-%s-%d' % (repo, node, job_name, build_number)

        # Artifacts for this job live under this key prefix; the job
        # result reactor derives the job ID back out of the key layout.
        bucket_prefix = 'jobs/%s/%s/%s/%d' % (repo, node, job_name, build_number)

        # Unfortunately we cannot set tags on spot instance requests.
        # So we encode tags in user data and parse these at launch time to
        # turn into proper tags.

        user_data_params = dict(
            job_id=job_id,
            repo=repo,
            repo_url=repo_url,
            node=node,
            job_name=job_name,
            build_number=build_number,
            python=python,
            run_tests_linux_b64=run_tests_linux_b64,
            s3_bucket=bucket.name,
            s3_prefix=bucket_prefix,
        )

        config = {
            'BlockDeviceMappings': block_device_mappings,
            'EbsOptimized': True,
            'IamInstanceProfile': {'Name': 'ci-worker'},
            'ImageId': image.id,
            'InstanceType': 'c5.9xlarge',
            'SecurityGroups': ['hg-linux-dev-1'],
        }

        job_params = {
            'user_data_template': LINUX_USER_DATA,
            'user_data_params': user_data_params,
            'ec2_instance_launch_config': config,
        }

        # DynamoDB numbers are Decimal; store the schedule time as such.
        schedule_time = decimal.Decimal(time.time())

        print('registering job in DynamoDB')
        job_table.put_item(Item={
            'job_id': job_id,
            'repo': repo,
            'node': node,
            'job_name': job_name,
            'build_number': build_number,
            'execution_state': 'pending',
            'schedule_time': schedule_time,
        })

        print('adding job to pending queue')
        sqs.send_message(
            QueueUrl=sqs_url,
            MessageBody=json.dumps(job_params, sort_keys=True)
        )
+
+
def start_pending_job(ec2, user_data_template, user_data_params, ec2_instance_config):
    """Called to request the start of a pending job.

    Interpolates the user data template and requests a spot instance with
    the given launch configuration. On failure, the caller's SQS message
    redelivery provides the retry mechanism.
    """
    user_data = user_data_template.format(**user_data_params)

    print('requesting spot instance for job %s' % user_data_params['job_id'])

    # EC2 requires user data to be base64 encoded in the launch spec.
    launch_spec = dict(ec2_instance_config)
    launch_spec['UserData'] = base64.b64encode(user_data.encode('utf-8')).decode('utf-8')

    # Spot instances are substantially cheaper but can be terminated at will
    # by Amazon. That's fine. We're a CI system. If the instance is terminated,
    # we can just retry the job.
    #
    # The max bid price is the on-demand price. So in the typical case we save
    # $$$. If we're unlucky we pay the on-demand rate. You can't lose.
    ec2.request_spot_instances(
        BlockDurationMinutes=60,
        ValidUntil=datetime.datetime.utcnow() + datetime.timedelta(minutes=10),
        LaunchSpecification=launch_spec,
    )
+
+
def react_to_instance_state_change(job_table, instance, state):
    """React to a CI worker instance state change.

    On 'pending' (instance launching): materialize tags from user data and
    mark the job as running. On 'shutting-down': mark the job done. All
    other states are ignored.
    """
    # Timestamps are stored as DynamoDB numbers (Decimal).
    now = decimal.Decimal(time.time())

    # CI workers advertise their job info via tags. However, the tags cannot
    # be set for spot instances and are instead encoded in user data. So when
    # a spot instance starts, detect that here and set the tags so they can be
    # seen by future handlers.
    tags = {t['Key']: t['Value'] for t in instance.tags or []}

    if state == 'pending' and 'job_id' not in tags:
        print('fetching UserData to parse tags')
        user_data = instance.describe_attribute(Attribute='userData')['UserData']['Value']
        user_data = base64.b64decode(user_data.encode('utf-8')).decode('utf-8')

        set_tags = []

        # Lines of the form `# TAG <name> <value>` encode tags to apply.
        for line in user_data.splitlines():
            if not line.startswith('# TAG '):
                continue

            kv = line[len('# TAG '):].strip()
            name, value = kv.split(' ', 1)
            tags[name] = value
            set_tags.append({
                'Key': name,
                'Value': value,
            })

        if set_tags:
            print('setting new tags on instance %s: %s' % (instance.instance_id, set_tags))
            instance.create_tags(Tags=set_tags)

    # NOTE(review): assumes every ci-worker instance carries (or encodes in
    # user data) a job_id tag; a worker without one raises KeyError here.
    job_id = tags['job_id']

    # New instance/job seen. Record that.
    if state == 'pending':
        # Spot instances can't have tags at launch time. So we encode tags in user
        # data, where they can always be parsed.

        print('recording running state for job %s' % job_id)
        job_table.update_item(
            Key={'job_id': job_id},
            UpdateExpression=(
                'set execution_state = :state, '
                'instance_id = :instance_id, '
                'start_time = :start_time, '
                'exit_clean = :exit_clean'
            ),
            ExpressionAttributeValues={
                ':state': 'running',
                ':instance_id': instance.instance_id,
                ':start_time': now,
                # Pessimistically false; flipped to true when the worker
                # uploads its `done` artifact.
                ':exit_clean': False,
            },
        )
        return

    elif state != 'shutting-down':
        return

    # Instance is shutting down. Job is done. Update the state change
    # and index results from S3.
    print('recording finished results from job %s' % job_id)

    job_table.update_item(
        Key={'job_id': job_id},
        UpdateExpression='set execution_state = :state, end_time = :end_time',
        ExpressionAttributeValues={
            ':state': 'done',
            ':end_time': now,
        },
    )
+
+
def process_job_artifact(job_table, test_result_table, key):
    """Process an S3 key representing a job artifact."""
    print('processing S3 object %s' % key.key)

    # `key` should be `jobs/<repo>/<node>/<job name>/<build number>/<artifact>`
    parts = key.key.split('/')
    if parts[0] != 'jobs':
        print('ignoring artifact not tied to a specific job: %s' % key.key)
        return

    if len(parts) < 6:
        print('key does not have enough parts: %s; ignoring' % key.key)
        return

    repo, node, job_name, build_number = parts[1:5]
    # Artifact names may themselves contain slashes; rejoin the remainder.
    artifact_name = '/'.join(parts[5:])

    # Mirrors the job_id layout chosen when the job was scheduled.
    job_id = '%s-%s-%s-%s' % (repo, node, job_name, build_number)

    # Verify the job ID is known.
    res = job_table.get_item(Key={'job_id': job_id}, ProjectionExpression='instance_id')
    if 'Item' not in res:
        print('unable to find job id (%s) for artifact: %s' % (job_id, key.key))
        return

    if artifact_name == 'report.json':
        # Test harness results; parsed into per-test records.
        process_report_json(job_table, test_result_table, job_id, repo, node,
                            job_name, key)
    elif artifact_name == 'output.log':
        # Record a public URL for the raw output log on the job record.
        output_log_url = '%s/%s/%s' % (
            key.meta.client.meta.endpoint_url,
            key.Bucket().name,
            key.key,
        )

        job_table.update_item(
            Key={'job_id': job_id},
            UpdateExpression='set output_log_url = :url',
            ExpressionAttributeValues={':url': output_log_url},
        )
    # This is written when the task shuts down cleanly.
    elif artifact_name == 'done':
        job_table.update_item(
            Key={'job_id': job_id},
            UpdateExpression='set exit_clean = :c',
            ExpressionAttributeValues={':c': True},
        )
    else:
        print('ignoring artifact %s' % artifact_name)
+
+
def process_report_json(job_table, test_result_table, job_id, repo, node,
                        job_name, key):
    """Process a `report.json` file emitted from Mercurial's test harness.

    Aggregates pass/fail/skip counts onto the job record and writes one
    record per test into the test result table.
    """
    print('retrieving S3 object %s' % key.key)
    results = json.load(key.get()['Body'])

    # Overall success means every test either passed or was skipped.
    overall = all(v['result'] in ('success', 'skip') for v in results.values())

    test_count = 0
    pass_count = 0
    skipped = set()
    failed = set()

    for k, v in results.items():
        test_count += 1

        if v['result'] == 'success':
            pass_count += 1
        elif v['result'] == 'skip':
            skipped.add(k)
        else:
            failed.add(k)

    # Record job metadata.
    job_table.update_item(
        Key={'job_id': job_id},
        UpdateExpression=(
            'set overall_result = :overall_result,'
            'test_count = :test_count, '
            'pass_count = :pass_count, '
            'skip_count = :skip_count, '
            'fail_count = :fail_count'
        ),
        ExpressionAttributeValues={
            ':overall_result': overall,
            ':test_count': test_count,
            ':pass_count': pass_count,
            ':skip_count': len(skipped),
            ':fail_count': len(failed),
        },
    )

    # And write each test result into the results table.
    with test_result_table.batch_writer() as batch:
        for k, v in results.items():
            v['job_id'] = job_id
            v['repo'] = repo
            v['node'] = node
            v['job_name'] = job_name
            v['test_name'] = k

            # Empty strings are not allowed. Normalize to None.
            v['diff'] = v['diff'] or None

            # Normalize numeric strings to Decimal (DynamoDB's number type).
            for kk in ('csys', 'cuser', 'end', 'start', 'time'):
                if v[kk]:
                    v[kk] = decimal.Decimal(v[kk])

            batch.put_item(Item=v)
diff --git a/contrib/ci/README.rst b/contrib/ci/README.rst
new file mode 100644
--- /dev/null
+++ b/contrib/ci/README.rst
@@ -0,0 +1,241 @@
+============
+Mercurial CI
+============
+
+This directory defines a CI system for the Mercurial Project.
+
+Architecture
+============
+
+The core of the CI system is built on top of Amazon Web Services. It
+consists of a number of *serverless* components which monitor the
+Mercurial Project, react to changes, store and provide access to
+results, etc.
+
+The CI system consists of the following primary components:
+
+* Data storage in DynamoDB and S3.
+* Lightweight compute via Lambda functions.
+* Execution of heavyweight tasks (e.g. running tests) on ephemeral EC2
+ instances.
+* HTTP API and HTML endpoint via API Gateway / Lambda functions.
+
+In addition, the following AWS resources are utilized:
+
+* IAM for defining roles and access policies for various entities.
+* SQS for event queuing.
+* CloudWatch for logging.
+
+Components
+==========
+
+Storage
+-------
+
+The CI system utilizes DynamoDB and S3 for storage. DynamoDB is utilized
+as a fast source-of-truth system for most data. S3 is mostly utilized as
+a temporary storage location for larger artifacts, such as the raw output
+logs for individual jobs.
+
+The Terraform code for this component lives in ``storage.tf``.
+
+Repo Poller
+-----------
+
+The system monitors configured Mercurial repositories for new pushes. It
+does this by periodically invoking a Lambda function via a CloudWatch event
+trigger. This Lambda function will make HTTP requests to the monitored
+repository and then record any observed changes to DynamoDB.
+
+The Terraform code for this component lives in ``repo_poll.tf``.
+
+Repo Change Reactor
+-------------------
+
+The *repo change reactor* is a component responsible for reacting to
+repository change events (e.g. repository pushes).
+
+The primary role of this component is to determine what work needs to be
+done and to schedule it.
+
+This component consists of a Lambda function which is triggered by DynamoDB
+changes initiated by the *Repo Poller*. This function will inspect the
+state of the world and generate pending *jobs* for that change event.
+It will add new job records to DynamoDB and will register ready-to-run
+jobs on an SQS queue.
+
+The Terraform code for this component lives in ``repo_change_reactor.tf``.
+
+Job Executor
+------------
+
+The *job executor* is a component responsible for executing and managing
+pending jobs. It essentially takes the work queued by the *Repo Change
+Reactor* and sets it in action.
+
+The *job executor* mainly consists of some Lambda functions.
+
+The *run pending job* Lambda function is subscribed to the SQS queue
+for pending jobs. When this function is invoked, it attempts to launch
+a new instance to run the requested job, whose launch configuration is
+defined in data in the SQS message. If the launch is successful, the
+new instance boots up and starts work on its own. If not, the Lambda
+function raises an exception and the message is placed back in the
+SQS queue, where it will be automatically retried later. Eventually,
+a message *times out* and will be dropped.
+
+This component also contains an *instance state change* Lambda function.
+This function is invoked whenever an EC2 instance's state changes, e.g.
+when a new instance is created, stops, or terminates. The role of this
+function is to keep accounting for running jobs and instances up-to-date.
+This function will update job records in DynamoDB to record that a
+job has started/finished/aborted.
+
+The Terraform code for this component lives in ``job_executor.tf``.
+
+Worker
+------
+
+The *worker* component is the entity doing compute-heavy work. It is
+essentially a single EC2 instance.
+
+The Terraform code for this component lives in ``worker.tf``.
+
+Job Result Reactor
+------------------
+
+The *Job Result Reactor* component centers around a Lambda function
+which is called when an S3 object is created. The role of this component
+is to react to artifacts uploaded by individual jobs.
+
+The Lambda function receives information about the uploaded S3 object.
+From this, the job ID can be derived and job state as well as test
+result state can be updated in DynamoDB.
+
+The Terraform code for this component lives in ``job_result_reactor.tf``.
+
+Web
+---
+
+The *web* component provides an HTTP endpoint for the CI system.
+
+The main functionality for the *web* component is implemented by a
+Lambda function, which is fronted by an API Gateway, which is configured
+to use a hostname backed by a legitimate x509 certificate so the
+service can be publicly exposed and will work well with browsers.
+
+The Terraform code for this component lives in ``web.tf``.
+
+AWS Account Management
+======================
+
+We currently assume that the AWS account used to run Mercurial CI
+is operated within a larger organization.
+
+Terraform references resources in this parent account. Specifically,
+the resources in ``account_manager.tf`` allow a Lambda function in
+this parent account to assume a role in this account which gives it
+unlimited powers. What's happening here is the parent account
+periodically scans this account for misbehavior or other activities
+which could cause badness, such as large cost expenditures. The
+account manager in the parent account does things like terminate
+long-running EC2 instances.
+
+Implementation Notes
+====================
+
+Instance Lifecycle
+------------------
+
+Each job is executed on a brand new EC2 instance. Once that job is
+done, the instance shuts down and terminates.
+
+This is a good model from security and determinism perspectives because
+we don't have to worry about previous state on the instance since each
+instance is launched from a known initial state (an AMI). As long as the
+IAM instance policy is locked down, even malicious code can't do much
+in terms of privileged operations - just generic compute. Generic compute
+can still be abused of course. But there's no way for us to prevent that
+abuse: CI is RCE as a service.
+
+A downside to new instances for every job is that there is a non-trivial
+amount of overhead to obtain an instance and boot into its OS - often a
+few dozen seconds. Ideally we'd have a pool of idle workers waiting to
+run new jobs.
+
+But there's a significant reason we use ephemeral, on-demand instances:
+cost. If we had instances sitting around, they'd accrue a substantial
+bill. On EC2, you are billed by the second for instance use (at least
+for most Linux AMIs - Windows and Linux distros with a support contract
+are billed by the hour). As long as the overhead for instance startup
+and shutdown are minimal and the overall utilization of the CI system
+is low, we still come out drastically ahead by launching instances on demand.
+
+Another benefit of launching a new instance per job is that we can scale
+out to infinite job parallelism. Instead of managing the worker pool size,
+we can just spin up instances when they are needed and dispose of them
+after. And all jobs will complete sooner.
+
+Since our instances are very transient, we use EC2 spot instances. This
+allows us to cut substantial costs versus the on-demand rate at most
+times. And the EC2 spot instances we use are rarely terminated by Amazon,
+so our failure rate for utilizing them is low. This is well worth the
+trade-off.
+
+Known Issues and Limitations
+----------------------------
+
+The job runner currently knows nothing about building AMIs. We should teach
+the system to build AMIs using Mercurial's in-repo functionality for
+doing so. This will require teaching the in-repo code to not purge old
+AMIs, as this could create issues when multiple branches have CI
+triggered at the same time. It should be possible to add a tag to the
+AMI to denote an expiration time or some such so that we can purge old
+or unused AMIs automatically.
+
+There's no machine API for accessing CI results. We should definitely
+build a JSON API or something along those lines.
+
+The ``result.json`` file produced by Mercurial's test harness doesn't
+contain as much detail as the raw output. Notably missing is the skip
+reason and annotations for test failures when there is no diff
+(e.g. timeout). We should extend the format to contain this metadata.
+
+We have no mechanism to retrigger a job. This requires some form of
+authentication to prevent abuse.
+
+We have no mechanism to trigger CI on arbitrary diffs. We would like
+to provide some kind of *try server* where you can submit a diff and
+the system builds it. Again, this requires some form of authentication.
+
+We have no mechanism to choose which jobs to execute. We probably want
+to build this because there is no need to execute all jobs all the time.
+
+Development Workflow
+====================
+
+The Terraform code and current production environment currently assumes
+it is running in an AWS account attached to Gregory Szorc's main AWS
+account. The details of this account can be found in the ``init.tf`` file.
+
+Terraform requires Terraform 0.12 or newer. You will also need to invoke
+``terraform`` such that it can locate AWS credentials with admin
+privileges to the AWS account being provisioned. Typically, you configure
+``~/.aws/config`` and ``~/.aws/credentials`` to define a profile plus
+credentials and set e.g. ``AWS_PROFILE=hg`` to use the ``hg`` AWS profile
+with Terraform.
+
+Assuming the AWS credentials are set up::
+
+ $ cd terraform
+ $ terraform init
+ $ terraform apply
+
+The ``terraform/`` directory contains all the AWS resources, including
+triggers between various resources (e.g. changes invoking Lambda
+functions).
+
+Most of the business logic lives in Lambda functions. These can be found
+in the ``lambda_functions/`` directory. A typical workflow is to make
+changes then run ``terraform apply``, ideally against a development AWS
+profile.
diff --git a/.hgignore b/.hgignore
--- a/.hgignore
+++ b/.hgignore
@@ -30,6 +30,7 @@
tests/htmlcov
build
contrib/chg/chg
+contrib/ci/terraform/.terraform
contrib/hgsh/hgsh
contrib/vagrant/.vagrant
dist
To: indygreg, #hg-reviewers
Cc: mercurial-devel
More information about the Mercurial-devel
mailing list