Skip to content
Open
12 changes: 12 additions & 0 deletions pulumi/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,17 @@
**psm_opts,
)

# Build one CloudWatch log destination per entry configured under
# ``tb:cloudwatch:LogDestination``. When the section is absent, the default
# ``{}`` yields an empty ``logdests`` mapping.
logdest_opts = resources.get('tb:cloudwatch:LogDestination', {})
logdests = {
    # Keyed by the config entry's name so later code can look a destination up by that name.
    name: tb_pulumi.cloudwatch.LogDestination(
        f'{project.name_prefix}-logdest-{name}', app_name=name, project=project, **settings
    )
    for name, settings in logdest_opts.items()
}

# Build out some private network space
vpc_opts = resources['tb:network:MultiTierVpc']['vpc']
vpc = tb_pulumi.network.MultiTierVpc(
Expand Down Expand Up @@ -71,6 +82,7 @@ def __stalwart_cluster(jumphost_rules: list[dict]):
return stalwart.StalwartCluster(
f'{project.name_prefix}-stalwart',
project=project,
log_group_arn=logdests['stalwart'].resources['iam_policies']['write'].arn,
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This wires the instances to the new LogDestination write policy, but that policy lives in the upcoming tb_pulumi release. From the implementation I reviewed, it looks like logs:CreateLogStream / logs:PutLogEvents may be scoped only to the log-group ARN rather than the log-stream ARNs. If that is still true in the cut release, Fluent Bit will bootstrap successfully but get AccessDenied when it tries to write events. Could we double-check the released policy shape before merging?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, this was a problem, but that was fixed by this PR, which is slated to go out with that release after it gets approved and merged.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just got that PR merged and have tested that code directly with success. When I go to prod with it, I'll double-check the policy that goes out there before shipping the fluent-bit configs to the live servers.

private_subnets=vpc.resources['private_subnets'],
public_subnets=vpc.resources['public_subnets'],
node_additional_ingress_rules=jumphost_rules,
Expand Down
6 changes: 6 additions & 0 deletions pulumi/bootstrap/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,19 @@
BOOTSTRAP_DIR = '/opt/stalwart-bootstrap'
BOOTSTRAP_LOG = '/var/log/stalwart-bootstrap.log'
INSTANCE_TAGS = {}

# Map of template files to target files
TEMPLATE_MAP = {
'fluent-bit.service.j2': '/usr/lib/systemd/system/fluent-bit.service',
'fluent-bit.yaml.j2': '/etc/fluent-bit/fluent-bit.yaml',
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add the new fluent-bit files to the list of things we template on a host.

'journald.conf.j2': '/etc/systemd/journald.conf',
'stalwart.toml.j2': '/opt/stalwart/etc/config.toml',
'thundermail.service.j2': '/usr/lib/systemd/system/thundermail.service',
}
# Map of template variable to EC2 tags
TEMPLATE_VALUE_TAG_MAP = {
'env': 'environment',
'function': 'postboot.stalwart.function',
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are the new template variables based on instance tags.

'https_paths': 'postboot.stalwart.https_paths',
'node_services': 'postboot.stalwart.node_services',
'node_id': 'postboot.stalwart.node_id',
Expand Down
14 changes: 14 additions & 0 deletions pulumi/bootstrap/templates/fluent-bit.service.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
[Unit]
Description=Fluent Bit
Documentation=https://docs.fluentbit.io/manual/
Requires=network.target
After=network.target

[Service]
Type=simple
Environment="ENV={{ env }}"
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

{{ this }} is a Jinja variable, replaced in the bootstrapping process. This is how ENV=stage or whatever gets into the service environment.

ExecStart=/opt/fluent-bit/bin/fluent-bit -c /etc/fluent-bit/fluent-bit.yaml
Restart=always

[Install]
WantedBy=multi-user.target
41 changes: 41 additions & 0 deletions pulumi/bootstrap/templates/fluent-bit.yaml.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
---

service:
flush: 1
grace: 5
daemon: no
dns.mode: UDP
hot_reload: on
log_level: info
storage.path: /fluent-bit/buffers
storage.backlog.flush_on_shutdown: on
storage.keep.rejected: on
storage.rejected.path: /fluent-bit/dlq

pipeline:
inputs:
- name: systemd
tag: cloudwatch.stalwart.{{ function }}
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tagging things with the function lets us catch them later and route them properly.

db: /opt/fluent-bit/thundermail.cursor
systemd_filter: _SYSTEMD_UNIT=thundermail.service
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Without a persisted cursor (for example, the `db` option) or a `read_from_tail` guard on this systemd input, Fluent Bit will read the existing thundermail.service journal on first boot and can replay entries after service restarts. That seems likely to backfill or duplicate logs in CloudWatch for these long-lived nodes. Is that intentional?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, I'll revisit this.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I opted for the db option since it seems less likely to miss messages.


filters: []

outputs:
# Send logs onward to CloudWatch. The log group with the given name must already exist, and this
# service must have sufficient IAM permissions to post events to these log streams. If the log
# streams do not exist, this service must have permission to create them.
- name: cloudwatch_logs
match: cloudwatch.stalwart.mail
log_group_name: /tb/${ENV}/stalwart
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

${THIS} is not Jinja, but a syntax native to fluent-bit's configuration. fluent-bit substitutes it at runtime with the value of the named environment variable.

log_stream_name: mail
region: eu-central-1
log_key: MESSAGE

- name: cloudwatch_logs
match: cloudwatch.stalwart.api
log_group_name: /tb/${ENV}/stalwart
log_stream_name: api
region: eu-central-1
log_key: MESSAGE

6 changes: 6 additions & 0 deletions pulumi/bootstrap/templates/journald.conf.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[Journal]
{% if env == 'prod' %}
MaxRetentionSec=3day
{% else %}
MaxRetentionSec=7day
{% endif %}
25 changes: 18 additions & 7 deletions pulumi/config.dev.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,15 @@ resources:
- stalwart.postboot.keycloak_backend
recovery_window_in_days: 0

tb:cloudwatch:LogDestination:
stalwart:
log_group:
retention_in_days: 7
log_streams:
api: api
mail: mail
org_name: tb

tb:network:MultiTierVpc:
vpc:
cidr_block: 10.2.0.0/16
Expand Down Expand Up @@ -43,18 +52,18 @@ resources:
additional_routes:
private:
- destination_cidr_block: 10.202.0.0/22 # observability-dev
vpc_peering_connection_id: pcx-0d2027442f0e54ca4
vpc_peering_connection_id: pcx-04d7e54008cd9326c
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The peering connection changed when I rebuilt dev for testing.

public: []
endpoint_interfaces:
- secretsmanager

# tb:ec2:SshableInstance: {}
tb:ec2:SshableInstance: {}
# Fill out this template to build an SSH bastion
tb:ec2:SshableInstance:
bastion:
ssh_keypair_name: mailstrom-dev
source_cidrs:
- 10.2.0.0/16 # Internal access
# tb:ec2:SshableInstance:
# bastion:
# ssh_keypair_name: mailstrom-dev
# source_cidrs:
# - 10.2.0.0/16 # Internal access

tb:mailstrom:StalwartCluster:
thundermail:
Expand All @@ -81,6 +90,7 @@ resources:
nodes:
"0": # Must be a unique, stringified integer
disable_api_termination: True
function: 'mail'
ignore_ami_changes: True
ignore_user_data_changes: True
instance_type: t3.micro
Expand All @@ -99,6 +109,7 @@ resources:
storage_capacity: 20
"50":
disable_api_termination: True
function: 'api'
ignore_ami_changes: True
ignore_user_data_changes: True
instance_type: t3.micro
Expand Down
12 changes: 12 additions & 0 deletions pulumi/config.prod.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,15 @@ resources:
- stalwart.postboot.keycloak_backend
- stalwart.postboot.postgresql_backend

tb:cloudwatch:LogDestination:
stalwart:
log_group:
retention_in_days: 3
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shorter data retention in prod. This is actually the default value for the option, but I like to be explicit.

log_streams:
api: api
mail: mail
org_name: tb

tb:network:MultiTierVpc:
vpc:
cidr_block: 10.0.0.0/16
Expand Down Expand Up @@ -93,6 +102,7 @@ resources:
nodes:
"0": # Must be a unique, stringified integer
disable_api_termination: True
function: 'mail'
ignore_ami_changes: True
ignore_user_data_changes: True
instance_type: t3a.large
Expand All @@ -110,6 +120,7 @@ resources:
storage_capacity: 20
"1": # Must be a unique, stringified integer
disable_api_termination: True
function: 'mail'
ignore_ami_changes: True
ignore_user_data_changes: True
instance_type: t3a.large
Expand All @@ -127,6 +138,7 @@ resources:
storage_capacity: 20
"50":
disable_api_termination: True
function: 'api'
ignore_ami_changes: True
ignore_user_data_changes: True
instance_type: t3.micro
Expand Down
11 changes: 11 additions & 0 deletions pulumi/config.stage.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,15 @@ resources:
- stalwart.postboot.postgresql_backend
- stalwart.postboot.keycloak_backend

tb:cloudwatch:LogDestination:
stalwart:
log_group:
retention_in_days: 7
log_streams:
api: api
mail: mail
org_name: tb

tb:network:MultiTierVpc:
vpc:
cidr_block: 10.1.0.0/16
Expand Down Expand Up @@ -108,6 +117,7 @@ resources:
# subnet: subnet-07ade1ed35462907d # eu-central-1a
"1":
disable_api_termination: True
function: 'mail'
ignore_ami_changes: True
ignore_user_data_changes: True
instance_type: t3a.large
Expand All @@ -126,6 +136,7 @@ resources:
subnet: subnet-07712b990eb0d17c0 # eu-central-1b
"50":
disable_api_termination: True
function: 'api'
ignore_ami_changes: True
ignore_user_data_changes: True
instance_type: t3.micro
Expand Down
2 changes: 1 addition & 1 deletion pulumi/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Jinja2>=3.1,<4.0
pulumi_cloudflare==6.6.0
tb_pulumi @ git+https://github.com/thunderbird/pulumi.git@v0.0.16
tb_pulumi @ git+https://github.com/thunderbird/pulumi.git@v0.0.18
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a future version with a bunch of little fixes to the core library that we need here, which is why we need the release to be done before merging this.

toml>=0.10.2,<0.11
20 changes: 19 additions & 1 deletion pulumi/stalwart/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,7 @@ def __init__(
self,
name: str,
project: tb_pulumi.ThunderbirdPulumiProject,
log_group_arn: str,
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This new variable gets passed down to the IAM setup so we can be sure the node profile includes write access to the Stalwart log group.

private_subnets: list[aws.ec2.Subnet],
public_subnets: list[aws.ec2.Subnet],
https_features: list = [],
Expand Down Expand Up @@ -343,8 +344,16 @@ def __init__(
s3_bucket, s3_secret, s3_policy = stalwart_s3.s3(self=self)

# Build an IAM role with a policy to enable node bootstrapping
profile_policy, role, profile_postboot_attachment, profile_s3_attachment, profile = stalwart_iam.iam(
(
profile_policy,
role,
profile_postboot_attachment,
profile_s3_attachment,
profile_logwrite_attachment,
profile,
) = stalwart_iam.iam(
self,
log_group_arn=log_group_arn,
s3_policy=s3_policy,
)

Expand Down Expand Up @@ -463,6 +472,7 @@ def __init__(
'spam_filter_secret': config_secrets['spam_filter'],
'node_profile': profile,
'node_profile_policy': profile_policy,
'node_profile_logwrite_attachment': profile_logwrite_attachment,
'node_profile_postboot_policy_attachment': profile_postboot_attachment,
'node_profile_s3_policy_attachment': profile_s3_attachment,
'node_sgs': self.node_sgs,
Expand Down Expand Up @@ -671,6 +681,7 @@ def node(
depends_on: list = [],
disable_api_stop: bool = False,
disable_api_termination: bool = False,
function: str = 'unknown',
ignore_ami_changes: bool = True,
ignore_user_data_changes: bool = True,
instance_type: str = 't3.micro',
Expand All @@ -694,6 +705,9 @@ def node(
False.
:type disable_api_termination: bool, optional

:param function: This becomes the ``postboot.stalwart.function`` tag on the instance and the ``function``
variable inside of postboot templates. Defaults to 'unknown'.
:type function: str, optional

:param ignore_ami_changes: When True, changes to the instance's AMI will not be applied. This prevents unwanted
rebuilding of cluster nodes, potentially causing downtime. Set to False if the AMI has changed and you
intend on rebuilding the node. Defaults to True.
Expand Down Expand Up @@ -749,6 +763,7 @@ def node(
postboot_tags = {
'postboot.stalwart.aws_region': self.project.aws_region,
'postboot.stalwart.env': self.project.stack,
'postboot.stalwart.function': function,
'postboot.stalwart.https_paths': ','.join(https_paths),
'postboot.stalwart.image': self.stalwart_image,
'postboot.stalwart.node_services': node_services_tag,
Expand Down Expand Up @@ -810,6 +825,9 @@ def user_data(self):
archive_file_base = './bootstrap'
archive_files = [
'bootstrap.py',
'templates/fluent-bit.service.j2',
'templates/fluent-bit.yaml.j2',
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These entries ensure that the new config templates wind up in the first-phase bootstrapping blob.

'templates/journald.conf.j2',
'templates/ports.j2',
'templates/stalwart.toml.j2',
'templates/thundermail.service.j2',
Expand Down
37 changes: 27 additions & 10 deletions pulumi/stalwart/iam.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@
import json
import pulumi_aws as aws

from tb_pulumi.constants import ASSUME_ROLE_POLICY, IAM_POLICY_DOCUMENT
from tb_pulumi.constants import ASSUME_ROLE_POLICY


def iam(
self,
log_group_arn: str,
s3_policy: aws.iam.Policy,
) -> tuple[
aws.iam.Policy, aws.iam.Role, aws.iam.RolePolicyAttachment, aws.iam.RolePolicyAttachment, aws.iam.InstanceProfile
Expand All @@ -32,14 +33,18 @@ def iam(
+ f':secret:mailstrom/{self.project.stack}/stalwart.postboot.*'
),
]
profile_postboot_policy_doc = IAM_POLICY_DOCUMENT.copy()
profile_postboot_policy_doc['Statement'][0].update(
{
'Sid': 'AllowPostbootSecretAccess',
'Action': ['secretsmanager:GetSecretValue'],
'Resource': bootstrap_secret_arns,
}
)
profile_postboot_policy_doc = {
'Version': '2012-10-17',
'Statement': [
{
'Sid': 'AllowPostbootSecretAccess',
'Effect': 'Allow',
'Action': ['secretsmanager:GetSecretValue'],
'Resource': bootstrap_secret_arns,
}
],
}

profile_policy = aws.iam.Policy(
f'{self.name}-policy-nodeprofile',
path='/',
Expand All @@ -64,7 +69,19 @@ def iam(
role=role.name,
policy_arn=s3_policy.arn,
)
profile_logwrite_attachment = aws.iam.RolePolicyAttachment(
f'{self.name}-rpa-nodeprofile-logs',
role=role.name,
policy_arn=log_group_arn,
)

profile = aws.iam.InstanceProfile(f'{self.name}-ip-nodeprofile', name=f'{self.name}-nodeprofile', role=role.name)

return profile_policy, role, profile_postboot_attachment, profile_s3_attachment, profile
return (
profile_policy,
role,
profile_postboot_attachment,
profile_s3_attachment,
profile_logwrite_attachment,
profile,
)
Loading
Loading