From bae96c04318c466b0bb674fc4fd170724b6b074d Mon Sep 17 00:00:00 2001
From: Chris Cowley <1736762+chriscowley@users.noreply.github.com>
Date: Fri, 18 Dec 2020 22:03:49 +0100
Subject: [PATCH] Add a section to the repo for architecture (#14944)

* Proposal for monitoring responsibilities

* added an architecture diagram for Prometheus

* install graphviz

* Only run the diagrams action when someone commits a diagram

* Filled out the architecture README

* Install Prometheus Node Exporter on all hosts

Co-authored-by: Chris Cowley
---
 .github/workflows/diagrams.yaml             | 37 ++++++++++++
 ansible/playbooks/role-rocky-monitoring.yml | 22 ++++++++
 architecture/monitoring/README.md           | 59 ++++++++++++++++++++
 architecture/monitoring/prometheus_mvp.py   | 62 +++++++++++++++++++++
 4 files changed, 180 insertions(+)
 create mode 100644 .github/workflows/diagrams.yaml
 create mode 100644 architecture/monitoring/README.md
 create mode 100755 architecture/monitoring/prometheus_mvp.py

diff --git a/.github/workflows/diagrams.yaml b/.github/workflows/diagrams.yaml
new file mode 100644
index 0000000..fed34b7
--- /dev/null
+++ b/.github/workflows/diagrams.yaml
@@ -0,0 +1,37 @@
+---
+name: Architecture Diagrams
+on:
+  push:
+    paths:
+      - 'architecture/**.py'
+
+jobs:
+  build-archi-diagrams:
+    runs-on: ubuntu-latest
+    timeout-minutes: 5
+
+    steps:
+      - name: Git Checkout
+        uses: actions/checkout@v2
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.8
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt -y install graphviz
+          python -m pip install --upgrade pip
+          pip install diagrams
+      - name: Build diagrams
+        run: |
+          for file in $(find architecture/ -type f -name "*.py")
+          do
+            python $file
+          done
+          mkdir artifacts; mv *.png artifacts/
+      - name: Upload diagram images
+        uses: actions/upload-artifact@v2
+        with:
+          name: diagrams-png
+          path: artifacts/*.png
diff --git a/ansible/playbooks/role-rocky-monitoring.yml b/ansible/playbooks/role-rocky-monitoring.yml
index 933b723..218a9c1 100644
--- a/ansible/playbooks/role-rocky-monitoring.yml
+++ b/ansible/playbooks/role-rocky-monitoring.yml
@@ -21,6 +21,7 @@
 
   roles:
     - role: cloudalchemy.prometheus
+    - role: cloudalchemy.alertmanager
 
   post_tasks:
     - name: Touching run file that ansible has ran here
@@ -30,3 +31,24 @@
         mode: '0644'
         owner: root
         group: root
+
+- name: Install Prometheus Node Exporter
+  hosts: all
+  become: true
+
+  pre_tasks:
+    - name: Install SELinux packages
+      package:
+        name: python3-policycoreutils.noarch
+        state: present
+
+  roles:
+    - role: cloudalchemy.node-exporter
+      state: present
+
+  post_tasks:
+    - name: Open firewall for node-exporter
+      ansible.posix.firewalld:
+        port: 9100/tcp
+        permanent: yes
+        state: enabled
diff --git a/architecture/monitoring/README.md b/architecture/monitoring/README.md
new file mode 100644
index 0000000..5fcf3c7
--- /dev/null
+++ b/architecture/monitoring/README.md
@@ -0,0 +1,59 @@
+# Monitoring
+
+For now, the planned monitoring platform is [Prometheus](https://prometheus.io/).
+
+Initially, we should keep it simple. Prometheus can scale a long way and
+allows a lot of clever stuff involving data archival and service discovery.
+This can all come in the medium-term.
+
+For now we want to solve the basics:
+
+- collect infrastructure metrics
+- visualise those over a reasonable time-frame
+- be alerted if one of those metrics does something funky
+
+For now we do not need HA, multi-year retention or automatic service discovery,
+so I propose something like the following:
+
+- A single Prometheus host in AWS
+  - Non-AWS exporters added via Ansible using file_sd
+  - AWS-hosted exporters added via ec2_sd
+- Grafana on that host
+- Alertmanager on that same host
+  - Non-critical alerts in a dedicated channel
+  - Critical alerts to a small group via a service like Pushover/PagerDuty
+
+## Pretty pictures via Python
+
+Use [python-diagrams](https://diagrams.mingrammer.com) to construct the diagram.
+
+```
+pip install --user diagrams
+python ./prometheus_mvp.py
+```
+
+We'll automate putting the output file somewhere ASAP.
+
+## What this is NOT addressing
+
+I am purposely not covering logging and web service uptime here. We can check
+web services with Prometheus, but an external service (UptimeRobot?) is, in my
+opinion, better suited to that problem.
+
+Likewise, I do not see logging as directly related. A separate stack is
+necessary for that. Loki would perhaps be a good solution that could
+use the same Grafana instance. ELK and Graylog are also worth considering.
+
+## Responsibilities
+
+The monitoring team cannot realistically be responsible for how every single
+service is monitored. Prometheus has a huge library of exporters for almost everything.
+
+The monitoring team can be responsible for ensuring that the infrastructure is
+available to the application/infrastructure teams, and that knowledge of how
+to add to that infrastructure is suitably shared.
+
+It falls on the application teams themselves to find a suitable exporter, add
+it to the Prometheus server and write the necessary alerts, queries and
+dashboards. Obviously, we will help as much as we can, but please don't ask
+me to learn the internals of FreeIPA, for example.
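
To make the file_sd / ec2_sd split proposed in the README a little more concrete, the scrape side of prometheus.yml could look roughly like the sketch below. This is not part of the change itself; the file path, region, job names and port are illustrative assumptions.

```
# Rough sketch only; paths, region, job names and port are assumptions.
scrape_configs:
  # Non-AWS hosts: Ansible drops one small target file per host into this directory.
  - job_name: node_file_sd
    file_sd_configs:
      - files:
          - /etc/prometheus/file_sd/*.yml
  # AWS-hosted exporters: discovered through the EC2 API, scraping node_exporter on 9100.
  - job_name: node_ec2_sd
    ec2_sd_configs:
      - region: us-east-1
        port: 9100
```

With this shape, adding a non-AWS host is just templating one more file_sd target file from Ansible, which fits the "keep it simple" goal above.
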
diff --git a/architecture/monitoring/prometheus_mvp.py b/architecture/monitoring/prometheus_mvp.py
new file mode 100755
index 0000000..23fbdce
--- /dev/null
+++ b/architecture/monitoring/prometheus_mvp.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+
+from diagrams import Diagram, Cluster, Edge
+from diagrams.aws.compute import EC2
+from diagrams.aws.general import General
+from diagrams.aws.network import ELB
+from diagrams.onprem.compute import Server
+from diagrams.onprem.iac import Ansible
+from diagrams.onprem.monitoring import Grafana, Prometheus
+from diagrams.saas.alerting import Pushover
+from diagrams.saas.chat import Slack
+
+graph_attr = {
+}
+
+node_attr = {
+}
+
+with Diagram("Prometheus MVP",
+             show=False,
+             direction="TB",
+             outformat="png",
+             graph_attr=graph_attr,
+             node_attr=node_attr):
+
+    with Cluster("Rocky VPC"):
+        with Cluster("AWS services"):
+            aws_group = [
+                EC2("service01"),
+                EC2("service02"),
+            ]
+        with Cluster("metrics host"):
+            metrics = Prometheus("metrics")
+            alertmanager = Prometheus("alertmanager")
+            dashboard = Grafana("monitoring")
+            metrics << dashboard
+            metrics >> alertmanager
+
+    Ansible("ansible") >> metrics
+    metrics >> Edge(style="dashed",
+                    label="ec2 read permissions") >> General("AWS API")
+
+    alertmanager >> Edge(style="dashed",
+                         label="non-critical") >> Slack("rocky-alerts")
+    alertmanager >> Edge(style="dashed",
+                         label="critical") >> Pushover("tbd")
+    ELB("metrics.rockylinux.org") >> Edge(label="TCP3000") >> dashboard
+    with Cluster("Cloudvider"):
+        cloudvider_group = [
+            Server("server01"),
+            Server("server02"),
+        ]
+
+    with Cluster("Spry Servers"):
+        spry_group = [
+            Server("server01"),
+            Server("server02"),
+        ]
+
+    metrics >> aws_group
+    metrics >> spry_group
+    metrics >> cloudvider_group
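
Likewise, the alert routing described in the README (non-critical alerts to a dedicated channel, critical ones to a pager-style service) could be sketched in Alertmanager roughly as follows. Receiver names, the channel and the severity label are assumptions rather than anything agreed yet.

```
# Rough sketch only; receiver names, channel and the severity label are assumptions.
route:
  receiver: slack-noncritical
  routes:
    - match:
        severity: critical
      receiver: pushover-critical

receivers:
  - name: slack-noncritical
    slack_configs:
      - channel: '#rocky-alerts'
  - name: pushover-critical
    pushover_configs:
      - user_key: <to be decided>
        token: <to be decided>
```

Something of this shape could presumably be fed to the cloudalchemy.alertmanager role added in the playbook above once the real receivers are decided.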