Skip to content

Commit 030c6bd

Browse files
authored
feat: regional API MIG (#2490)
* feat(iac): add regional API MIG alongside zonal pool Introduces an `api-cluster` module that provisions the API nodepool as a regional MIG (BALANCED distribution) in parallel with the existing zonal `google_compute_instance_group_manager.api_pool`. Load balancer backends are unchanged and still point at the zonal pool; the regional pool is staged for a follow-up cutover. Also drops the `datacenters = ["${gcp_zone}"]` constraint from the docker-reverse-proxy job so it can schedule on API nodes regardless of zone once the regional MIG is live. * feat(iac): wire regional API MIG into LB backends Backs the four API-facing LB backends (session, api, docker-reverse-proxy, ingress) with both the zonal and regional instance groups so traffic is served by both pools during cutover. Also parameterizes the regional MIG's surge/unavailable counts off the actual zone count in the region (mirrors the server pool pattern), instead of hardcoding 3. * chore: remove datacenters completely * chore: improve name * fix: remove nomad api check conflict * chore: set 0 unavailable * chore: remove unnecessary gcp zone * refactor(iac): move api nodepool module under provider-gcp/modules Match the layout convention used by provider-aws/modules so the API nodepool module lives alongside its scripts and is reusable across envs without further state migration.
1 parent e870251 commit 030c6bd

13 files changed

Lines changed: 403 additions & 10 deletions

File tree

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
locals { # Renders scripts/start-api.sh once; the result is used as the instance template's metadata_startup_script.
2+
startup_script = templatefile("${path.module}/scripts/start-api.sh", {
3+
CLUSTER_TAG_NAME = var.cluster_tag_name
4+
SCRIPTS_BUCKET = var.cluster_setup_bucket_name
5+
GCP_REGION = var.gcp_region
6+
GOOGLE_SERVICE_ACCOUNT_KEY = var.google_service_account_key # NOTE(review): declared sensitive, yet rendered into the startup script text - confirm this exposure is acceptable
7+
CONSUL_TOKEN = var.consul_acl_token_secret # sensitive input, also rendered into the script
8+
RUN_CONSUL_FILE_HASH = var.file_hash["scripts/run-consul.sh"] # direct index: the key must exist in var.file_hash
9+
RUN_NOMAD_FILE_HASH = var.file_hash["scripts/run-nomad.sh"] # direct index: the key must exist in var.file_hash
10+
CONSUL_GOSSIP_ENCRYPTION_KEY = var.consul_gossip_encryption_key_secret_data # sensitive input, also rendered into the script
11+
CONSUL_DNS_REQUEST_TOKEN = var.consul_dns_request_token_secret_data # sensitive input, also rendered into the script
12+
NODE_POOL = var.node_pool
13+
})
14+
}
15+
16+
resource "google_compute_health_check" "nomad_check" { # HTTP health check against the Nomad agent; referenced by the MIG's auto_healing_policies.
17+
name = "${var.cluster_name}-nomad-check"
18+
check_interval_sec = 15
19+
timeout_sec = 10
20+
healthy_threshold = 2
21+
unhealthy_threshold = 10 # 10 consecutive failures x 15 s interval = ~150 seconds
22+
23+
log_config {
24+
enable = true
25+
}
26+
27+
http_health_check {
28+
request_path = "/v1/agent/health"
29+
port = var.nomad_port
30+
}
31+
}
32+
33+
data "google_compute_zones" "region_zones" { # Zones of var.gcp_region; the number of names sizes max_surge_fixed on the regional MIG.
34+
region = var.gcp_region
35+
}
36+
37+
resource "google_compute_region_instance_group_manager" "pool" { # Regional managed instance group for the API nodes; its instance_group is exported as the module output.
38+
name = "${var.cluster_name}-rig"
39+
region = var.gcp_region
40+
41+
version {
42+
name = google_compute_instance_template.template.id
43+
instance_template = google_compute_instance_template.template.id
44+
}
45+
46+
named_port { # client proxy health port
47+
name = var.client_proxy_health_port.name
48+
port = var.client_proxy_health_port.port
49+
}
50+
51+
named_port { # client proxy port
52+
name = var.client_proxy_port.name
53+
port = var.client_proxy_port.port
54+
}
55+
56+
named_port { # API port
57+
name = var.api_port.name
58+
port = var.api_port.port
59+
}
60+
61+
named_port { # docker reverse proxy port
62+
name = var.docker_reverse_proxy_port.name
63+
port = var.docker_reverse_proxy_port.port
64+
}
65+
66+
named_port { # ingress port
67+
name = var.ingress_port.name
68+
port = var.ingress_port.port
69+
}
70+
71+
auto_healing_policies {
72+
health_check = google_compute_health_check.nomad_check.id
73+
initial_delay_sec = 600 # 10 minutes
74+
}
75+
76+
distribution_policy_target_shape = "BALANCED"
77+
78+
# API pool rollout policy: a new instance template is applied by replacing instances, surging up to as many extra
79+
# instances as the region has zones and taking none unavailable; PROACTIVE only in dev, OPPORTUNISTIC elsewhere.
80+
update_policy {
81+
type = var.environment == "dev" ? "PROACTIVE" : "OPPORTUNISTIC"
82+
minimal_action = "REPLACE"
83+
max_surge_percent = null
84+
max_unavailable_percent = null
85+
replacement_method = "SUBSTITUTE"
86+
87+
max_surge_fixed = length(data.google_compute_zones.region_zones.names) # one surge slot per zone in the region
88+
max_unavailable_fixed = 0
89+
90+
instance_redistribution_type = "NONE"
91+
}
92+
93+
base_instance_name = var.cluster_name
94+
target_size = var.cluster_size
95+
target_pools = []
96+
97+
depends_on = [ # NOTE(review): looks redundant - the version block already references the template, which creates the dependency
98+
google_compute_instance_template.template,
99+
]
100+
}
101+
102+
data "google_compute_image" "source_image" { # Looks up the image for var.image_family; its id is the boot disk source_image of the instance template.
103+
family = var.image_family
104+
}
105+
106+
resource "google_compute_instance_template" "template" { # Instance template used by the regional MIG "pool" (referenced from its version block).
107+
name_prefix = "${var.cluster_name}-"
108+
109+
instance_description = null
110+
machine_type = var.machine_type
111+
112+
labels = merge(
113+
var.labels,
114+
(var.environment != "dev" ? {
115+
goog-ops-agent-policy = "v2-x86-template-1-2-0-${var.gcp_zone}" # NOTE(review): label embeds the single var.gcp_zone while the MIG is regional - confirm the policy still matches instances placed in other zones
116+
} : {})
117+
)
118+
tags = [var.cluster_tag_name]
119+
metadata_startup_script = local.startup_script
120+
metadata = merge(
121+
{ api_cluster = "TRUE" },
122+
{
123+
enable-osconfig = "TRUE",
124+
enable-guest-attributes = "TRUE",
125+
},
126+
)
127+
128+
scheduling {
129+
on_host_maintenance = "MIGRATE"
130+
}
131+
132+
disk {
133+
boot = true
134+
source_image = data.google_compute_image.source_image.id
135+
disk_size_gb = 200
136+
disk_type = var.boot_disk_type
137+
}
138+
139+
network_interface {
140+
network = var.network_name
141+
142+
dynamic "access_config" { # emits one empty access_config block only when var.api_use_nat is false
143+
for_each = var.api_use_nat ? [] : ["public_ip"]
144+
content {}
145+
}
146+
}
147+
148+
# For a full list of OAuth 2.0 scopes, see https://developers.google.com/identity/protocols/googlescopes
149+
service_account {
150+
email = var.google_service_account_email
151+
scopes = [
152+
"userinfo-email",
153+
"compute-ro",
154+
"https://www.googleapis.com/auth/logging.write",
155+
"https://www.googleapis.com/auth/monitoring.write",
156+
"https://www.googleapis.com/auth/trace.append",
157+
"https://www.googleapis.com/auth/cloud-platform" # NOTE(review): broad scope - presumably makes the narrower scopes above redundant; confirm intent
158+
]
159+
}
160+
161+
# Per Terraform Docs (https://www.terraform.io/docs/providers/google/r/compute_instance_template.html#using-with-instance-group-manager),
162+
# we need to create a new instance template before we can destroy the old one. Note that any Terraform resource on
163+
# which this Terraform resource depends will also need this lifecycle statement.
164+
lifecycle {
165+
create_before_destroy = true
166+
167+
# TODO: Temporary workaround to avoid unnecessary updates to the instance template.
168+
# This should be removed once cluster size is removed from the metadata.
169+
ignore_changes = [metadata] # changes to the metadata map above will not trigger a new template while this is set
170+
}
171+
}
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
output "instance_group" { # Exposes the instance group managed by the regional MIG resource "pool".
2+
description = "Self-link of the regional instance group, for use as a load balancer backend."
3+
value = google_compute_region_instance_group_manager.pool.instance_group
4+
}

iac/provider-gcp/nomad-cluster/scripts/start-api.sh renamed to iac/provider-gcp/modules/nodepool-api/scripts/start-api.sh

File renamed without changes.
Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
variable "gcp_region" { # Region of the regional MIG and of the zone lookup; also passed to the startup script as GCP_REGION.
2+
type = string
3+
}
4+
5+
variable "gcp_zone" { # Only interpolated into the goog-ops-agent-policy label on the instance template.
6+
description = "GCP zone used for labels (ops agent policy). Does not restrict MIG placement."
7+
type = string
8+
}
9+
10+
variable "network_name" { # Network set on the instance template's network_interface.
11+
type = string
12+
}
13+
14+
variable "cluster_tag_name" { # Used as the instance template's tag and passed to the startup script as CLUSTER_TAG_NAME.
15+
description = "Network tag applied to cluster instances for firewall rules and Consul auto-discovery."
16+
type = string
17+
}
18+
19+
variable "cluster_name" { # Also prefixes the health check name, the MIG name ("-rig" suffix) and the template name_prefix.
20+
description = "Name of the cluster (used as base_instance_name and resource name prefix)."
21+
type = string
22+
}
23+
24+
variable "cluster_size" { # target_size of the regional MIG; must be >= 1.
25+
type = number
26+
27+
validation {
28+
condition = var.cluster_size >= 1
29+
error_message = "Cluster size must be at least 1."
30+
}
31+
}
32+
33+
variable "machine_type" { # machine_type of the instance template.
34+
type = string
35+
}
36+
37+
variable "image_family" { # Resolved through the google_compute_image data source for the boot disk.
38+
description = "GCE image family for the API instances."
39+
type = string
40+
}
41+
42+
variable "boot_disk_type" { # disk_type of the template's boot disk.
43+
description = "GCE boot disk type for the API instances."
44+
type = string
45+
}
46+
47+
variable "api_use_nat" { # When false, the template adds an access_config block to the network interface.
48+
description = "Whether API nodes should route outbound traffic through NAT (no external IPs)."
49+
type = bool
50+
}
51+
52+
# ---------------------------------------------------------------------------------------------------------------------
53+
# LOAD BALANCER NAMED PORTS
54+
# ---------------------------------------------------------------------------------------------------------------------
55+
56+
variable "api_port" { # name/port become a named_port on the MIG; health_path is not read by the resources shown in this module.
57+
type = object({
58+
name = string
59+
port = number
60+
health_path = string
61+
})
62+
}
63+
64+
variable "client_proxy_health_port" { # name/port become a named_port on the MIG; path is not read by the resources shown in this module.
65+
type = object({
66+
name = string
67+
port = number
68+
path = string
69+
})
70+
}
71+
72+
variable "client_proxy_port" { # name/port become a named_port on the MIG.
73+
type = object({
74+
name = string
75+
port = number
76+
})
77+
}
78+
79+
variable "ingress_port" { # name/port become a named_port on the MIG; health_path is not read by the resources shown in this module.
80+
type = object({
81+
name = string
82+
port = number
83+
health_path = string
84+
})
85+
}
86+
87+
variable "docker_reverse_proxy_port" { # name/port become a named_port on the MIG; health_path is not read by the resources shown in this module.
88+
type = object({
89+
name = string
90+
port = number
91+
health_path = string
92+
})
93+
}
94+
95+
# ---------------------------------------------------------------------------------------------------------------------
96+
# SERVICE ACCOUNT & AUTHENTICATION
97+
# ---------------------------------------------------------------------------------------------------------------------
98+
99+
variable "google_service_account_email" { # Service account email set in the instance template's service_account block.
100+
type = string
101+
}
102+
103+
variable "google_service_account_key" { # Passed to the startup script template as GOOGLE_SERVICE_ACCOUNT_KEY.
104+
type = string
105+
sensitive = true
106+
}
107+
108+
# ---------------------------------------------------------------------------------------------------------------------
109+
# NOMAD & CONSUL CONFIGURATION
110+
# ---------------------------------------------------------------------------------------------------------------------
111+
112+
variable "nomad_port" { # Port probed by the HTTP health check at /v1/agent/health.
113+
type = number
114+
}
115+
116+
variable "consul_acl_token_secret" { # Passed to the startup script template as CONSUL_TOKEN.
117+
type = string
118+
sensitive = true
119+
}
120+
121+
variable "consul_gossip_encryption_key_secret_data" { # Passed to the startup script template as CONSUL_GOSSIP_ENCRYPTION_KEY.
122+
type = string
123+
sensitive = true
124+
}
125+
126+
variable "consul_dns_request_token_secret_data" { # Passed to the startup script template as CONSUL_DNS_REQUEST_TOKEN.
127+
type = string
128+
sensitive = true
129+
}
130+
131+
variable "node_pool" { # Passed to the startup script template as NODE_POOL.
132+
description = "Nomad node pool name for API workloads."
133+
type = string
134+
}
135+
136+
# ---------------------------------------------------------------------------------------------------------------------
137+
# STORAGE BUCKETS
138+
# ---------------------------------------------------------------------------------------------------------------------
139+
140+
variable "cluster_setup_bucket_name" { # Passed to the startup script template as SCRIPTS_BUCKET.
141+
description = "GCS bucket containing the run-nomad.sh / run-consul.sh setup scripts."
142+
type = string
143+
}
144+
145+
# ---------------------------------------------------------------------------------------------------------------------
146+
# DEPLOYMENT METADATA
147+
# ---------------------------------------------------------------------------------------------------------------------
148+
149+
variable "environment" { # "dev" => PROACTIVE MIG updates and no ops-agent policy label; otherwise OPPORTUNISTIC updates plus the label.
150+
type = string
151+
validation {
152+
condition = contains(["dev", "staging", "prod"], var.environment)
153+
error_message = "Environment must be one of: dev, staging, prod"
154+
}
155+
}
156+
157+
variable "labels" { # Base labels merged into the instance template's labels.
158+
type = map(string)
159+
}
160+
161+
variable "file_hash" { # Indexed directly with "scripts/run-consul.sh" and "scripts/run-nomad.sh" when rendering the startup script, so both keys must be present.
162+
description = "Map of setup script file paths to their content hashes for versioning."
163+
type = map(string)
164+
}

0 commit comments

Comments
 (0)