Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.history/
26 changes: 15 additions & 11 deletions templates/Makefile
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
ENVIRONMENT ?= stage
PROJECT = <% .Name %>

apply: apply-remote-state apply-secrets apply-env apply-k8s-utils
apply: apply-remote-state apply-secrets apply-env apply-k8s-utils post-apply-setup

apply-remote-state:
aws s3 ls <% .Name %>-$(ENVIRONMENT)-terraform-state || (\
aws s3 ls $(PROJECT)-$(ENVIRONMENT)-terraform-state || (\
cd terraform/bootstrap/remote-state && \
terraform init && \
terraform apply -var "environment=$(ENVIRONMENT)" $(AUTO_APPROVE) && \
rm ./terraform.tfstate)

apply-secrets:
aws iam list-access-keys --user-name <% .Name %>-ci-user > /dev/null || (\
aws iam list-access-keys --user-name $(PROJECT)-ci-user > /dev/null || (\
cd terraform/bootstrap/secrets && \
terraform init && \
terraform apply $(AUTO_APPROVE) && \
Expand All @@ -27,24 +28,27 @@ apply-k8s-utils: update-k8s-conf
terraform apply $(AUTO_APPROVE)

update-k8s-conf:
aws eks --region <% index .Params `region` %> update-kubeconfig --name <% .Name %>-$(ENVIRONMENT)-<% index .Params `region` %>
aws eks --region <% index .Params `region` %> update-kubeconfig --name $(PROJECT)-$(ENVIRONMENT)-<% index .Params `region` %>

post-apply-setup:
cd scripts && ENVIRONMENT=$(ENVIRONMENT) PROJECT=$(PROJECT) sh post-apply.sh

teardown: teardown-k8s-utils teardown-env teardown-secrets teardown-remote-state

teardown-remote-state:
@echo "Deleting remote state is not reversible, are you sure you want to delete the resources? [y/N]:" && read ans && [ $${ans:-N} == y ] && \
export AWS_PAGER='' && export AWS_DEFAULT_REGION=<% index .Params `region` %> && \
aws s3 rm s3://<% .Name %>-$(ENVIRONMENT)-terraform-state --recursive && \
aws s3 rb s3://<% .Name %>-$(ENVIRONMENT)-terraform-state --force && \
aws dynamodb delete-table --region <% index .Params `region` %> --table-name <% .Name %>-$(ENVIRONMENT)-terraform-state-locks
aws s3 rm s3://$(PROJECT)-$(ENVIRONMENT)-terraform-state --recursive && \
aws s3 rb s3://$(PROJECT)-$(ENVIRONMENT)-terraform-state --force && \
aws dynamodb delete-table --region <% index .Params `region` %> --table-name $(PROJECT)-$(ENVIRONMENT)-terraform-state-locks

teardown-secrets:
@echo "Deleting secrets is not reversible, are you sure you want to delete the secrets? [y/N]:" && read ans && [ $${ans:-N} == y ] && \
export AWS_PAGER='' && export AWS_DEFAULT_REGION=<% index .Params `region` %> && \
aws secretsmanager list-secrets --region <% index .Params `region` %> --query "SecretList[?Tags[?Key=='project' && Value=='<% .Name %>']].[Name] | [0][0]" | xargs aws secretsmanager delete-secret --region <% index .Params `region` %> --secret-id && \
aws secretsmanager list-secrets --region <% index .Params `region` %> --query "SecretList[?Tags[?Key=='rds' && Value=='<% .Name %>-$(ENVIRONMENT)']].[Name] | [0][0]" | xargs aws secretsmanager delete-secret --region <% index .Params `region` %> --secret-id && \
aws iam delete-access-key --user-name <% .Name %>-ci-user --access-key-id $(shell aws iam list-access-keys --user-name <% .Name %>-ci-user --query "AccessKeyMetadata[0].AccessKeyId" | sed 's/"//g') && \
aws iam delete-user --user-name <% .Name %>-ci-user
aws secretsmanager list-secrets --region <% index .Params `region` %> --query "SecretList[?Tags[?Key=='project' && Value=='$(PROJECT)']].[Name] | [0][0]" | xargs aws secretsmanager delete-secret --region <% index .Params `region` %> --secret-id && \
aws secretsmanager list-secrets --region <% index .Params `region` %> --query "SecretList[?Tags[?Key=='rds' && Value=='$(PROJECT)-$(ENVIRONMENT)']].[Name] | [0][0]" | xargs aws secretsmanager delete-secret --region <% index .Params `region` %> --secret-id && \
aws iam delete-access-key --user-name $(PROJECT)-ci-user --access-key-id $(shell aws iam list-access-keys --user-name $(PROJECT)-ci-user --query "AccessKeyMetadata[0].AccessKeyId" | sed 's/"//g') && \
aws iam delete-user --user-name $(PROJECT)-ci-user

teardown-env:
cd terraform/environments/$(ENVIRONMENT) && \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ data "aws_db_instance" "database" {

resource "kubernetes_namespace" "app_namespace" {
metadata {
name = "${var.project}"
name = var.project
}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
locals {
log_format = <<EOF
{
"time_local": "$time_local",
"timestamp": "$time_iso8601",
"remote_addr": "$remote_addr",
"remote_user": "$remote_user",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,19 +48,6 @@ resource "kubernetes_namespace" "logging" {
# depends_on = [kubernetes_namespace.logging]
# }

# # ExternalName service allowing us to refer to elasticsearch
# resource "kubernetes_service" "kibana_service" {
# metadata {
# name = "elasticsearch"
# namespace = "logging"
# }
# spec {
# type = "ExternalName"
# external_name = "es-eks.${var.internal_domain}"
# }
# depends_on = [kubernetes_namespace.logging]
# }


# # Create prometheus exporter to gather metrics about the elasticsearch cluster
# resource "helm_release" "elasticsearch_prometheus_exporter" {
Expand Down
66 changes: 66 additions & 0 deletions templates/scripts/elasticsearch-logging.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#!/bin/sh
set -e
# This file will run locally. It runs a job inside the kubernetes cluster that will set up
# indices and a lifecycle policy in Elasticsearch.
# Requires PROJECT and ENVIRONMENT to be set by the caller (see post-apply.sh / Makefile).

DOCKER_IMAGE_TAG=commitdev/zero-k8s-utilities:0.0.2
# Resolve the VPC endpoint of the AWS Elasticsearch logging domain for this project/environment
ES_ENDPOINT=$(aws es describe-elasticsearch-domain --domain-name ${PROJECT}-${ENVIRONMENT}-logging --query "DomainStatus.Endpoints.vpc" | jq -r '.')

# `return` is only valid inside a function in POSIX sh — use `exit` so the subshell
# fails and `set -e` aborts the script when the domain can't be found.
[ "${ES_ENDPOINT}" = "" ] && (echo "Unable to get elasticsearch cluster for domain '${PROJECT}-${ENVIRONMENT}-logging'"; exit 1)

# Idempotent setup: tolerate the namespace/configmaps already existing from a previous run
kubectl create namespace zero-setup 2>/dev/null || echo "Namespace exists"
kubectl create configmap setup-script -n zero-setup --from-file=files/elasticsearch-setup.sh 2>/dev/null || echo "Setup script exists"
kubectl create configmap index-policy -n zero-setup --from-file=files/elasticsearch-index-policy-stage.json --from-file=files/elasticsearch-index-policy-prod.json 2>/dev/null || echo "Index policy exists"

# Create a job that runs the setup script inside the cluster, where the ES VPC endpoint is reachable
kubectl apply -f - <<EOF
apiVersion: batch/v1
kind: Job
metadata:
  name: elasticsearch-setup
  namespace: zero-setup
spec:
  template:
    spec:
      containers:
      - name: elasticsearch-setup
        image: ${DOCKER_IMAGE_TAG}
        command: ["sh"]
        args: ["/elasticsearch-setup.sh"]
        env:
        - name: ES_ENDPOINT
          value: ${ES_ENDPOINT}
        - name: ENVIRONMENT
          value: ${ENVIRONMENT}
        volumeMounts:
        - mountPath: /elasticsearch-setup.sh
          name: setup-script
          subPath: elasticsearch-setup.sh
        - mountPath: /elasticsearch-index-policy-stage.json
          name: index-policy
          subPath: elasticsearch-index-policy-stage.json
        - mountPath: /elasticsearch-index-policy-prod.json
          name: index-policy
          subPath: elasticsearch-index-policy-prod.json
      volumes:
      - name: setup-script
        configMap:
          name: setup-script
      - name: index-policy
        configMap:
          name: index-policy
      restartPolicy: Never
  backoffLimit: 0
EOF

echo "Setting up Elasticsearch indices for log storage..."

# Wait for the job, then clean up. Test the command directly in the `if`:
# under `set -e`, running it first and checking $? afterwards would abort the
# script on failure before the check, making the else branch unreachable.
if kubectl -n zero-setup wait --for=condition=complete --timeout=20s job elasticsearch-setup
then
  echo "Done. Writing elasticsearch setup logs to elasticsearch-setup.log"
  kubectl logs --tail=-1 -n zero-setup -l job-name=elasticsearch-setup > ../elasticsearch-setup.log
  # Delete the zero-setup namespace after the job is complete
  kubectl delete namespace zero-setup
else
  echo "Failed to execute elasticsearch setup, please see 'kubectl logs -n zero-setup -l job-name=elasticsearch-setup'"
  # Preserve the failing exit status the original `set -e` behaviour implied
  exit 1
fi
72 changes: 72 additions & 0 deletions templates/scripts/files/elasticsearch-index-policy-prod.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
{
"policy": {
"description": "Keep 2 days of hot data, 4 weeks of warm data, 4 more weeks of cold data, then delete",
"default_state": "hot",
"states": [
{
"name": "hot",
"actions": [
{
"replica_count": {
"number_of_replicas": 2
}
}
],
"transitions": [
{
"state_name": "warm",
"conditions": {
"min_index_age": "2d"
}
}
]
},
{
"name": "warm",
"actions": [
{
"replica_count": {
"number_of_replicas": 2
},
"read_only": {}
}
],
"transitions": [
{
"state_name": "cold",
"conditions": {
"min_index_age": "30d"
}
}
]
},
{
"name": "cold",
"actions": [
{
"replica_count": {
"number_of_replicas": 1
}
}
],
"transitions": [
{
"state_name": "delete",
"conditions": {
"min_index_age": "58d"
}
}
]
},
{
"name": "delete",
"actions": [
{
"delete": {}
}
],
"transitions": []
}
]
}
}
54 changes: 54 additions & 0 deletions templates/scripts/files/elasticsearch-index-policy-stage.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
{
"policy": {
"description": "Keep 1 day of hot data, 1 month of cold data, then delete",
"default_state": "hot",
"states": [
{
"name": "hot",
"actions": [
{
"replica_count": {
"number_of_replicas": 1
}
}
],
"transitions": [
{
"state_name": "cold",
"conditions": {
"min_index_age": "1d"
}
}
]
},
{
"name": "cold",
"actions": [
{
"replica_count": {
"number_of_replicas": 1
},
"read_only": {}
}
],
"transitions": [
{
"state_name": "delete",
"conditions": {
"min_index_age": "30d"
}
}
]
},
{
"name": "delete",
"actions": [
{
"delete": {}
}
],
"transitions": []
}
]
}
}
24 changes: 24 additions & 0 deletions templates/scripts/files/elasticsearch-setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/sh
set -e
# This script is pushed into a configmap and executed inside a container in the
# kubernetes cluster, which gives it network access to Elasticsearch.

# Base URL of the Elasticsearch cluster (endpoint is injected via the job's env)
ES_URL="http://${ES_ENDPOINT}"

echo "Executing Elasticsearch queries to configure the ${ENVIRONMENT} environment"

# Register the fluentd index pattern so Kibana can query the log indices
curl -X POST "${ES_URL}/_plugin/kibana/api/saved_objects/index-pattern" -H 'kbn-xsrf: true' -H 'Content-Type: application/json' \
-d'{"attributes":{"title":"fluentd-*","timeFieldName":"@timestamp","fields":"[]"}}'

if [ "${ENVIRONMENT}" != "stage" ]; then
  # Production: hot/warm/cold/delete lifecycle, 2 replicas per index
  curl -X PUT "${ES_URL}/_opendistro/_ism/policies/hot_warm_cold_delete_workflow?pretty" -H 'Content-Type: application/json' -d@/elasticsearch-index-policy-prod.json
  # Attach the policy to all indices matching the fluentd pattern
  curl -X PUT "${ES_URL}/_template/fluentd_template?pretty" -H 'Content-Type: application/json' \
-d'{ "index_patterns": ["fluentd-*"], "settings": { "number_of_shards": 2, "number_of_replicas": 2, "opendistro.index_state_management.policy_id": "hot_warm_cold_delete_workflow" }}'
else
  # Staging: hot/cold/delete lifecycle, single replica per index
  curl -X PUT "${ES_URL}/_opendistro/_ism/policies/hot_cold_delete_workflow?pretty" -H 'Content-Type: application/json' -d@/elasticsearch-index-policy-stage.json
  # Attach the policy to all indices matching the fluentd pattern
  curl -X PUT "${ES_URL}/_template/fluentd_template?pretty" -H 'Content-Type: application/json' \
-d'{ "index_patterns": ["fluentd-*"], "settings": { "number_of_shards": 2, "number_of_replicas": 1, "opendistro.index_state_management.policy_id": "hot_cold_delete_workflow" }}'
fi
4 changes: 4 additions & 0 deletions templates/scripts/post-apply.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/sh
set -e
# Runs after `make apply` (invoked from the post-apply-setup Makefile target with
# PROJECT and ENVIRONMENT exported). The `.` builtin is used instead of `source`:
# `source` is a bashism and is not available in all /bin/sh implementations
# (e.g. dash). The explicit ./ prefix is required because POSIX `.` searches
# $PATH rather than the current directory.

<% if ne (index .Params `loggingType`) "kibana" %># <% end %>. ./elasticsearch-logging.sh
36 changes: 36 additions & 0 deletions templates/terraform/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -131,3 +131,39 @@ The process should be:
- Do the drain/delete process with one node at a time. Wait for a new node to be available before running the process on a second one. This will prevent any traffic from being lost.

Done!

<% if eq (index .Params `loggingType`) "kibana" %>
## Kibana and Elasticsearch index Management

After creating the AWS Elasticsearch cluster to hold log data it’s a good idea to create index policies to control how data ages over time.

Typically you will want different policies on Staging and Production, as staging will probably have fewer restrictions around availability and speed, and retaining more data increases cost.

Some default indices and lifecycle policies are created automatically; you can view them in Kibana's Index Management UI by clicking on the "IM" tab. The policies that were created are in [scripts/files/](../scripts/files/)
If you want to change these policies you can update the json files as necessary and then run `sh scripts/elasticsearch-logging.sh`

### Maintenance

Over the long term, policies like this should prevent indices from growing too big for the system to be able to store, but if the policies or amount of data per day change over time it may be necessary to investigate the state of the system to tweak some of these values.

The most likely limitations to hit will be size on disk and number of shards.

**Number of shards** will most likely stay at a stable amount, regardless of log volume, unless the policies are changed, as the policies control the number of indices that will be maintained, and each index has a set number of shards.

To see the current number of shards you can execute the stats query through the Kibana dev UI:

```
GET /_stats
result:
{ "_shards" : { "total" : 471, "successful" : 240, "failed" : 0 }, ...
}
```

The number of shards can’t exceed 1000 per node. If it reaches that limit, new indices can’t be created and log ingestion will stop until previous indices have been deleted.

**Size on disk** may fluctuate more than the number of shards because it is affected by the log volume. Old indices will be removed which will clear space every day, but it’s possible that the log volume will increase faster than the rate old logs are deleted, in which case the disk may fill up.

The best place to view this is the AWS console for Elasticsearch.

If the free space gets too low, the EBS volume can be resized by changing the value in Terraform, and it will be resized with no downtime.
<% end %>
2 changes: 1 addition & 1 deletion templates/terraform/environments/prod/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ module "prod" {
logging_type = "<% index .Params `loggingType` %>"
<% if ne (index .Params `loggingType`) "kibana" %># <% end %>logging_es_version = "7.7"
<% if ne (index .Params `loggingType`) "kibana" %># <% end %>logging_az_count = "2"
<% if ne (index .Params `loggingType`) "kibana" %># <% end %>logging_es_instance_type = "t2.medium.elasticsearch"
<% if ne (index .Params `loggingType`) "kibana" %># <% end %>logging_es_instance_type = "m5.large.elasticsearch"
<% if ne (index .Params `loggingType`) "kibana" %># <% end %>logging_es_instance_count = "2" # Must be a multiple of the az count
<% if ne (index .Params `loggingType`) "kibana" %># <% end %>logging_volume_size_in_gb = "50" # Maximum value is limited by the instance type
# See https://docs.aws.amazon.com/elasticsearch-service/latest/developerguide/aes-limits.html
Expand Down
3 changes: 2 additions & 1 deletion templates/terraform/environments/stage/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,9 @@ module "stage" {
# Logging configuration
logging_type = "<% index .Params `loggingType` %>"
<% if ne (index .Params `loggingType`) "kibana" %># <% end %>logging_es_version = "7.7"
<% if ne (index .Params `loggingType`) "kibana" %># <% end %>logging_create_service_role = true # Set this to false if you need to create more than one ES cluster in an AWS account
<% if ne (index .Params `loggingType`) "kibana" %># <% end %>logging_az_count = "1"
<% if ne (index .Params `loggingType`) "kibana" %># <% end %>logging_es_instance_type = "t2.small.elasticsearch"
<% if ne (index .Params `loggingType`) "kibana" %># <% end %>logging_es_instance_type = "t2.medium.elasticsearch"
<% if ne (index .Params `loggingType`) "kibana" %># <% end %>logging_es_instance_count = "1" # Must be a multiple of the az count
<% if ne (index .Params `loggingType`) "kibana" %># <% end %>logging_volume_size_in_gb = "10" # Maximum value is limited by the instance type
# See https://docs.aws.amazon.com/elasticsearch-service/latest/developerguide/aes-limits.html
Expand Down
2 changes: 1 addition & 1 deletion templates/terraform/modules/environment/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -104,5 +104,5 @@ module "logging" {
instance_type = var.logging_es_instance_type
instance_count = var.logging_es_instance_count
ebs_volume_size_in_gb = var.logging_volume_size_in_gb

create_service_role = var.logging_create_service_role
}
Loading