-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathMakefile
More file actions
138 lines (117 loc) · 7.08 KB
/
Makefile
File metadata and controls
138 lines (117 loc) · 7.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# Romansh-LLM: run data and training scripts from repo root.
# Use ENV=dev or ENV=prod to select config and Terraform environment (default: prod).
# The targets below are commands, not files; declaring them phony keeps a
# same-named file or directory from making a target look up to date.
# Order: local workflow, Terraform, Docker/SageMaker, development tooling.
.PHONY: \
    download-data pretrain all help \
    tf-init tf-plan tf-apply tf-destroy tf-output tf-lint \
    docker-build docker-push sagemaker-launch aws-pretrain download-model job-status job-logs \
    install-pre-commit pre-commit
# Deployment environment. It selects the default config file below and is
# passed to Terraform as the "environment" variable. Override on the command
# line, e.g. `make pretrain ENV=dev`.
ENV ?= prod
# Config file handed to the training and launch scripts; defaults to the
# per-ENV YAML file unless CONFIG is set explicitly.
CONFIG ?= configs/$(ENV).yaml
# Directory containing the helper scripts invoked by the recipes.
SCRIPTS := scripts
# Directory passed to `terraform -chdir=`.
TF_DIR := terraform
# `-var` flag appended to terraform plan/apply/destroy. Recursive (`=`), so
# $(ENV) is expanded each time the variable is used.
TF_VAR_ENV = -var=environment=$(ENV)
# Print a summary of the available targets, grouped by area.
# As the first rule in the file, this is what a bare `make` runs.
# $(ENV) and $(CONFIG) are expanded by make, so the text shows the values
# that are currently in effect.
help:
	@printf '%s\n' \
		"Romansh-LLM targets (ENV=dev or prod, default: prod):" \
		" make download-data Cache ZurichNLP/quotidiana from Hugging Face" \
		" make pretrain Run CPT with QLoRA (ENV=$(ENV), CONFIG=$(CONFIG))" \
		" make all download-data then pretrain" \
		"" \
		"Terraform (SageMaker infra; separate per ENV):" \
		" make tf-init terraform init" \
		" make tf-plan terraform plan (ENV=$(ENV))" \
		" make tf-apply terraform apply (ENV=$(ENV))" \
		" make tf-destroy terraform destroy (ENV=$(ENV))" \
		" make tf-output show terraform outputs" \
		" make tf-lint terraform fmt/validate, tflint, checkov (requires: terraform, tflint, uv sync --extra dev)" \
		"" \
		"Docker & SageMaker:" \
		" make docker-build Build training image (local)" \
		" make docker-push Push image to ECR (requires tf-apply for current ENV)" \
		" make sagemaker-launch Start CPT job (ENV=$(ENV); optional: HF_TOKEN, INSTANCE_TYPE)" \
		" make aws-pretrain Full AWS flow for ENV (optional: YES=1, SKIP_TERRAFORM=1, SKIP_PUSH=1)" \
		" make download-model Download model from SageMaker S3 (requires JOB_NAME=...)" \
		" make job-status Check SageMaker training job status (requires JOB_NAME=...)" \
		" make job-logs Stream training job logs from CloudWatch (requires JOB_NAME=...; Ctrl+C to stop)" \
		"" \
		"Development:" \
		" make install-pre-commit Install pre-commit hooks (run once)" \
		" make pre-commit Run pre-commit on all files" \
		"" \
		"Examples: make pretrain ENV=dev | make aws-pretrain ENV=dev | make download-model JOB_NAME=... | make job-status JOB_NAME=... | make job-logs JOB_NAME=..."
# Run the data download script. Per the help text, this caches the
# ZurichNLP/quotidiana dataset from Hugging Face.
download-data:
	$(SCRIPTS)/download_data.sh
# Run the training script with the selected config file exported to it as the
# CONFIG environment variable.
# $(CONFIG) is expanded by make before the shell parses the line, so it is
# quoted: an overridden path containing spaces then reaches the script as one
# value instead of being split into an assignment plus a stray command word.
pretrain:
	CONFIG="$(CONFIG)" $(SCRIPTS)/pretrain.sh
# Run download-data and then pretrain, strictly in that order (the help text
# describes this target as "download-data then pretrain").
# The two steps are run as sequential sub-makes instead of being listed as
# prerequisites: independent prerequisites may be started concurrently under
# `make -j`, which would not guarantee the documented order.
# `-f` re-selects this makefile explicitly because a -f given on the command
# line is not passed on to sub-makes automatically.
all:
	$(MAKE) -f $(firstword $(MAKEFILE_LIST)) download-data
	$(MAKE) -f $(firstword $(MAKEFILE_LIST)) pretrain
# Run `terraform init` in $(TF_DIR).
tf-init:
	terraform -chdir=$(TF_DIR) init
# Run tf-init first, then `terraform plan` for the selected ENV, saving the
# plan under the name "tfplan".
tf-plan: tf-init
	terraform -chdir=$(TF_DIR) plan -out=tfplan $(TF_VAR_ENV)
# Run `terraform apply` for the selected ENV.
# Note: this target has no tf-init prerequisite and does not consume the
# "tfplan" file written by tf-plan.
tf-apply:
	terraform -chdir=$(TF_DIR) apply $(TF_VAR_ENV)
# Run `terraform destroy` for the selected ENV.
# Note: this target has no tf-init prerequisite.
tf-destroy:
	terraform -chdir=$(TF_DIR) destroy $(TF_VAR_ENV)
# Print the Terraform outputs from $(TF_DIR). ENV is not passed here.
tf-output:
	terraform -chdir=$(TF_DIR) output
# Static checks for the Terraform code, run after tf-init:
#   1. terraform fmt -check (recursive)
#   2. terraform validate
#   3. tflint (aborts with an install hint when tflint is not on PATH)
#   4. checkov, run through `uv run`
# Each step announces itself; the first failing step stops the target.
tf-lint: tf-init
	@echo "Running terraform fmt -check..." && \
		terraform -chdir=$(TF_DIR) fmt -check -recursive
	@echo "Running terraform validate..." && \
		terraform -chdir=$(TF_DIR) validate
	@echo "Running tflint..."
	@if ! command -v tflint >/dev/null 2>&1; then \
		echo "Error: tflint not found. Install it (e.g. brew install tflint) and run 'tflint --init' in $(TF_DIR)/."; \
		exit 1; \
	fi; \
	tflint -f compact --chdir=$(TF_DIR)
	@echo "Running checkov..." && \
		uv run checkov -d $(TF_DIR) --framework terraform --quiet
# Build the image from the current directory for linux/amd64 and tag it
# "romansh-llm" (docker-push later refers to it as romansh-llm:latest).
docker-build:
	docker build --platform linux/amd64 -t romansh-llm .
# Build the image (docker-build prerequisite), then tag it and push it to ECR.
# Terraform outputs read:
#   ecr_image_uri       full image reference to tag and push
#   ecr_repository_url  its part before the first "/" is the registry host
#                       used for `docker login`
#   region              AWS region for `aws ecr get-login-password`
# If the image URI or registry host is missing, the recipe stops and asks for
# `make tf-apply` to be run first.
# Login, tag and push are chained with `&&`: a failed login or tag must never
# be followed by a push, which could otherwise upload a stale image that still
# carries the target tag from an earlier run.
docker-push: docker-build
	@uri=$$(terraform -chdir=$(TF_DIR) output -raw ecr_image_uri 2>/dev/null); \
	reg=$$(terraform -chdir=$(TF_DIR) output -raw ecr_repository_url 2>/dev/null | cut -d/ -f1); \
	if [ -z "$$uri" ] || [ -z "$$reg" ]; then echo "Run 'make tf-apply ENV=$(ENV)' first."; exit 1; fi; \
	region=$$(terraform -chdir=$(TF_DIR) output -raw region) || exit 1; \
	uv run aws ecr get-login-password --region "$$region" \
		| docker login --username AWS --password-stdin "$$reg" && \
	docker tag romansh-llm:latest "$$uri" && \
	docker push "$$uri"
# Download and unpack the model archive of a SageMaker training job.
# Required: JOB_NAME=<training job name>.
# Region: $AWS_REGION, else $AWS_DEFAULT_REGION, else `aws configure get region`.
# Source: s3://sagemaker-<region>-<account>/output/<JOB_NAME>/output/model.tar.gz
# Destination: output/sagemaker/<JOB_NAME>/ (created if missing).
# Every AWS call goes through `uv run aws`, matching the download step and
# the other targets, so the lookups use the same CLI as the copy itself.
# Download and extraction failures are reported separately.
# NOTE(review): the success message assumes the archive contains a final/
# directory; that is not checked here.
download-model:
	@if [ -z "$(JOB_NAME)" ]; then echo "Error: pass JOB_NAME=... (the SageMaker training job name)."; exit 1; fi; \
	region=$${AWS_REGION:-$$AWS_DEFAULT_REGION}; \
	if [ -z "$$region" ]; then region=$$(uv run aws configure get region 2>/dev/null); fi; \
	if [ -z "$$region" ]; then echo "Error: set AWS_REGION or run 'aws configure'."; exit 1; fi; \
	account=$$(uv run aws sts get-caller-identity --query Account --output text 2>/dev/null); \
	if [ -z "$$account" ]; then echo "Error: AWS credentials not configured (run 'uv run aws configure')."; exit 1; fi; \
	s3_uri="s3://sagemaker-$$region-$$account/output/$(JOB_NAME)/output/model.tar.gz"; \
	dest_dir="output/sagemaker/$(JOB_NAME)"; \
	mkdir -p "$$dest_dir" || exit 1; \
	echo "Downloading $$s3_uri to $$dest_dir/..."; \
	if ! uv run aws s3 cp "$$s3_uri" "$$dest_dir/model.tar.gz"; then \
		echo "Download failed. Check JOB_NAME and that the job has completed."; \
		exit 1; \
	fi; \
	if ! (cd "$$dest_dir" && tar -xzf model.tar.gz); then \
		echo "Extraction failed: could not unpack $$dest_dir/model.tar.gz."; \
		exit 1; \
	fi; \
	echo "Unpacked model to $$dest_dir/final/"
# Show the status of a SageMaker training job.
# Required: JOB_NAME=<training job name>.
# Queries TrainingJobStatus, SecondaryStatus, CreationTime, TrainingEndTime
# and FailureReason as one tab-separated text line and prints one per line.
# The recipe is plain POSIX sh (make runs recipes with /bin/sh by default):
# the tab character comes from printf, and the line is piped into `read`
# instead of using the bash-only $'\t' and <<< forms.
# Timestamps: the part before the first "." is tried as an epoch value with
# BSD `date -r` and then GNU `date -d @...`; if neither works, the raw value
# is printed. "None" is printed as-is.
# A failed describe call now prints an error instead of exiting silently.
job-status:
	@if [ -z "$(JOB_NAME)" ]; then echo "Error: pass JOB_NAME=... (the SageMaker training job name)."; exit 1; fi; \
	echo "Training job: $(JOB_NAME)"; echo ""; \
	r=$$(uv run aws sagemaker describe-training-job --training-job-name "$(JOB_NAME)" \
		--query '[TrainingJobStatus,SecondaryStatus,CreationTime,TrainingEndTime,FailureReason]' --output text 2>/dev/null) \
		|| { echo "Error: could not describe training job '$(JOB_NAME)' (check the job name, region and AWS credentials)." >&2; exit 1; }; \
	tab=$$(printf '\t'); \
	printf '%s\n' "$$r" | { \
		IFS="$$tab" read -r status secondary started ended failure; \
		started_fmt=$$( [ "$$started" = "None" ] && echo "None" || (date -r "$${started%%.*}" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || date -d "@$${started%%.*}" '+%Y-%m-%d %H:%M:%S' 2>/dev/null) || echo "$$started"); \
		ended_fmt=$$( [ "$$ended" = "None" ] && echo "None" || (date -r "$${ended%%.*}" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || date -d "@$${ended%%.*}" '+%Y-%m-%d %H:%M:%S' 2>/dev/null) || echo "$$ended"); \
		echo "Status: $$status"; echo "Secondary: $$secondary"; echo "Started: $$started_fmt"; echo "Ended: $$ended_fmt"; echo "Failure: $$failure"; \
	}
# Print CloudWatch log messages for a SageMaker training job.
# Required: JOB_NAME=<training job name>.
# Fetches events from the last 7200 seconds (2 hours) in the log group
# /aws/sagemaker/TrainingJobs whose stream name starts with JOB_NAME, and
# prints only the message text. The start time is in epoch milliseconds.
job-logs:
	@if [ -z "$(JOB_NAME)" ]; then \
		echo "Error: pass JOB_NAME=... (the SageMaker training job name)."; \
		exit 1; \
	fi; \
	since_ms=$$(( ($$(date +%s) - 7200) * 1000 )); \
	uv run aws logs filter-log-events \
		--log-group-name /aws/sagemaker/TrainingJobs \
		--log-stream-name-prefix "$(JOB_NAME)" \
		--start-time $$since_ms \
		--query 'events[*].message' \
		--output text
# Start a SageMaker job through scripts/launch_sagemaker_job.py.
# Terraform outputs read: ecr_image_uri, sagemaker_role_arn, s3_training_bucket.
# The instance type comes from $INSTANCE_TYPE and falls back to ml.g5.2xlarge
# when that is unset or empty.
# If the image URI or role ARN is missing, the recipe stops and asks for
# `make tf-apply` to be run first. The bucket value is passed through without
# that check.
sagemaker-launch:
	@image=$$(terraform -chdir=$(TF_DIR) output -raw ecr_image_uri 2>/dev/null); \
	role_arn=$$(terraform -chdir=$(TF_DIR) output -raw sagemaker_role_arn 2>/dev/null); \
	s3_bucket=$$(terraform -chdir=$(TF_DIR) output -raw s3_training_bucket 2>/dev/null); \
	instance=$${INSTANCE_TYPE:-ml.g5.2xlarge}; \
	if [ -n "$$image" ] && [ -n "$$role_arn" ]; then \
		python $(SCRIPTS)/launch_sagemaker_job.py \
			--image-uri $$image \
			--role $$role_arn \
			--config $(CONFIG) \
			--instance-type "$$instance" \
			--s3-bucket "$$s3_bucket"; \
	else \
		echo "Run 'make tf-apply ENV=$(ENV)' first."; \
		exit 1; \
	fi
# Run scripts/run_aws_pretrain.sh for the selected ENV and CONFIG.
# Optional make variables map to script flags:
#   YES=1             -> --yes
#   SKIP_TERRAFORM=1  -> --skip-terraform
#   SKIP_PUSH=1       -> --skip-push
# /usr/local/bin and /opt/homebrew/bin are prepended to PATH for the script,
# and INSTANCE_TYPE and DOCKER are passed through from the environment.
# $(ENV) and $(CONFIG) are expanded by make before the shell parses the line,
# so they are quoted to keep values containing spaces intact.
# $$opts is deliberately left unquoted so each flag becomes its own argument.
aws-pretrain:
	@opts=""; \
	if [ "$(YES)" = "1" ]; then opts="$$opts --yes"; fi; \
	if [ "$(SKIP_TERRAFORM)" = "1" ]; then opts="$$opts --skip-terraform"; fi; \
	if [ "$(SKIP_PUSH)" = "1" ]; then opts="$$opts --skip-push"; fi; \
	PATH="/usr/local/bin:/opt/homebrew/bin:$${PATH}" \
	ENV="$(ENV)" CONFIG="$(CONFIG)" \
	INSTANCE_TYPE="$${INSTANCE_TYPE}" DOCKER="$${DOCKER}" \
		$(SCRIPTS)/run_aws_pretrain.sh $$opts
# One-time developer setup: sync the project environment with the "dev" extra,
# then install the pre-commit git hooks.
install-pre-commit:
	uv sync --extra dev
	uv run pre-commit install
# Run all pre-commit hooks against every file in the repository.
pre-commit:
	uv run pre-commit run --all-files