cat > iops_saturation.py << 'PYEOF'
#!/usr/bin/env python3
"""
IOPS Saturation Monitor
Scans EBS volumes and standard RDS instances to identify resources
that have sustained IOPS utilisation at or above a threshold percentage
of their capacity for longer than a specified duration.
IMPORTANT: Aurora is NOT monitored by this script because:
Aurora Architecture (Two Separate Storage Layers):
1. Local EBS Storage (temp files, NOT monitored by ReadIOPS/WriteIOPS):
- Used for: sorting, temp tables, index builds, query scratch space
- Monitored by: FreeLocalStorage CloudWatch metric
- Constrained by: Instance class EBS bandwidth/IOPS limits ✓
2. Aurora Cluster Storage (tables/indexes, monitored by ReadIOPS/WriteIOPS):
- Used for: All persistent database data (tables, indexes)
- Monitored by: ReadIOPS and WriteIOPS CloudWatch metrics
- Auto-scales: 10 GB to 128 TB, handles up to 256k IOPS
- NOT constrained by instance class EBS bandwidth/IOPS limits ✗
- SupportsIops=false, SupportsStorageThroughput=false, StorageType="aurora"
Since ReadIOPS/WriteIOPS (what users care about) measure Aurora CLUSTER storage,
and cluster storage has NO instance-level IOPS ceiling, there is nothing to monitor
against instance class IOPS limits.
NOTE: Aurora LOCAL storage (temp files) CAN be constrained by instance class EBS limits.
This could be monitored via Performance Insights wait events (IO:BufFileRead, IO:BufFileWrite),
but would require a different script using the Performance Insights API, not CloudWatch metrics.
Temp file I/O issues are typically solved by query tuning (work_mem, sort optimization),
not instance scaling, so this is not included in this script.
Aurora Serverless V2 Exclusion:
- Aurora Serverless V2 is excluded (no instance-level IOPS ceiling)
- Uses Aurora Capacity Units (ACUs): 0-256 ACUs (increments of 0.5)
- Aurora storage auto-scales (10 GB to 256 TiB) and I/O scales with workload demand
- Same distributed storage subsystem as Aurora provisioned clusters
- No fixed instance-level IOPS bottleneck exists to monitor
References:
- Aurora Serverless V2: https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/aurora-serverless-v2.html
- Aurora instance types: https://aws.amazon.com/rds/aurora/instance-types/
"The maximum EBS bandwidth refers to I/O bandwidth for local storage within
the DB instance. It doesn't apply to communication with the Aurora storage volume."
Metric selection and ceiling calculation are automatic per service type:
EBS VolumeReadOps + VolumeWriteOps (Count / 60s = IOPS)
Ceiling = min(provisioned_storage_iops, instance_type_iops_ceiling)
✓ Dual-ceiling problem: storage + instance
RDS standard ReadIOPS + WriteIOPS (rate metric, IOPS directly)
Ceiling = min(provisioned_storage_iops, instance_class_iops_ceiling)
✓ Dual-ceiling problem: storage + instance
✓ Engines: postgres, mysql, mariadb, oracle, sqlserver (NOT aurora)
Usage:
python iops_saturation.py --max-ops-pct 90 --max-ops-duration-secs 120 \
--ou-id ou-xxxx-xxxxxxxx --regions eu-west-1 us-east-1
python iops_saturation.py --max-ops-pct 95 --max-ops-duration-secs 300 \
--lookback-hours 48 --accounts 123456789012 --regions af-south-1
Required permissions on the assumed role:
cloudwatch:GetMetricStatistics
rds:DescribeDBInstances
ec2:DescribeVolumes
ec2:DescribeInstances
References:
- Blog: https://andrewbaker.ninja/2026/03/03/knowing-your-iops-are-broken-is-not-the-same-as-knowing-they-are-about-to-break/
- Aurora instance types: https://aws.amazon.com/rds/aurora/instance-types/
- RDS instance classes: https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/Concepts.DBInstanceClass.html
"""
# Bootstrap: install missing dependencies before any other imports
import subprocess
import sys
def _bootstrap():
    """
    Install any missing runtime dependencies via pip before the main imports.

    Lets the script run on a fresh machine without a manual `pip install`;
    packages already importable are left untouched.
    """
    needed = {"boto3": "boto3", "pandas": "pandas", "openpyxl": "openpyxl"}

    def _absent(module_name):
        # Probe by importing; ImportError means the package must be installed.
        try:
            __import__(module_name)
        except ImportError:
            return True
        return False

    to_install = [pkg for module, pkg in needed.items() if _absent(module)]
    if to_install:
        print(f"[bootstrap] Installing missing packages: {', '.join(to_install)}", flush=True)
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "--break-system-packages", "--quiet"] + to_install,
            stderr=subprocess.STDOUT,
        )
_bootstrap()
import boto3
import csv
import argparse
import logging
from datetime import datetime, timezone, timedelta
from dataclasses import dataclass, asdict, field
from typing import Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
try:
import pandas as pd
PANDAS_AVAILABLE = True
except ImportError:
PANDAS_AVAILABLE = False
# Root logger configuration: timestamped INFO-level lines,
# e.g. "2024-01-01 12:00:00 [INFO] message".
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)
# Module-level logger used throughout the script.
log = logging.getLogger(__name__)
# Per-instance-type EBS IOPS ceiling: the maximum IOPS the *instance* can push
# to EBS regardless of how many IOPS the attached volume has provisioned.
# NOTE(review): values presumably transcribed from the AWS "Amazon EBS-optimized
# instances" tables -- verify exact figures against current AWS documentation.
# Instance types missing from this table make audit_ebs() fall back to the
# storage-provisioned ceiling alone.
EC2_IOPS_CEILING = {
    "t3.nano": 2085, "t3.micro": 2085, "t3.small": 2085, "t3.medium": 2085,
    "t3.large": 2085, "t3.xlarge": 2085, "t3.2xlarge": 2085,
    "t3a.nano": 2085, "t3a.micro": 2085, "t3a.small": 2085, "t3a.medium": 2085,
    "t3a.large": 2085, "t3a.xlarge": 2085, "t3a.2xlarge": 2085,
    "t4g.nano": 2085, "t4g.micro": 2085, "t4g.small": 2085, "t4g.medium": 2085,
    "t4g.large": 2085, "t4g.xlarge": 2085, "t4g.2xlarge": 2085,
    "m5.large": 3600, "m5.xlarge": 6000, "m5.2xlarge": 8333,
    "m5.4xlarge": 16667, "m5.8xlarge": 18750, "m5.12xlarge": 28750,
    "m5.16xlarge": 37500, "m5.24xlarge": 40000, "m5.metal": 40000,
    "m5a.large": 3600, "m5a.xlarge": 6000, "m5a.2xlarge": 8333,
    "m5a.4xlarge": 16667, "m5a.8xlarge": 18750, "m5a.12xlarge": 28750,
    "m5a.16xlarge": 37500, "m5a.24xlarge": 40000,
    "m6g.medium": 3500, "m6g.large": 3500, "m6g.xlarge": 7000,
    "m6g.2xlarge": 10000, "m6g.4xlarge": 20000, "m6g.8xlarge": 30000,
    "m6g.12xlarge": 40000, "m6g.16xlarge": 40000, "m6g.metal": 40000,
    "m6i.large": 6667, "m6i.xlarge": 10000, "m6i.2xlarge": 13333,
    "m6i.4xlarge": 20000, "m6i.8xlarge": 26667, "m6i.12xlarge": 40000,
    "m6i.16xlarge": 40000, "m6i.24xlarge": 40000, "m6i.32xlarge": 40000,
    "r5.large": 3600, "r5.xlarge": 6000, "r5.2xlarge": 8333,
    "r5.4xlarge": 16667, "r5.8xlarge": 18750, "r5.12xlarge": 28750,
    "r5.16xlarge": 37500, "r5.24xlarge": 40000, "r5.metal": 40000,
    "r6g.medium": 3500, "r6g.large": 3500, "r6g.xlarge": 7000,
    "r6g.2xlarge": 10000, "r6g.4xlarge": 20000, "r6g.8xlarge": 30000,
    "r6g.12xlarge": 40000, "r6g.16xlarge": 40000, "r6g.metal": 40000,
    "r6i.large": 6667, "r6i.xlarge": 10000, "r6i.2xlarge": 13333,
    "r6i.4xlarge": 20000, "r6i.8xlarge": 26667, "r6i.12xlarge": 40000,
    "r6i.16xlarge": 40000, "r6i.24xlarge": 40000, "r6i.32xlarge": 40000,
    "c5.large": 3600, "c5.xlarge": 6000, "c5.2xlarge": 8333,
    "c5.4xlarge": 16667, "c5.9xlarge": 20000, "c5.12xlarge": 28750,
    "c5.18xlarge": 37500, "c5.24xlarge": 40000, "c5.metal": 40000,
    "c6g.medium": 3500, "c6g.large": 3500, "c6g.xlarge": 7000,
    "c6g.2xlarge": 10000, "c6g.4xlarge": 20000, "c6g.8xlarge": 30000,
    "c6g.12xlarge": 40000, "c6g.16xlarge": 40000, "c6g.metal": 40000,
    "c6i.large": 6667, "c6i.xlarge": 10000, "c6i.2xlarge": 13333,
    "c6i.4xlarge": 20000, "c6i.8xlarge": 26667, "c6i.12xlarge": 40000,
    "c6i.16xlarge": 40000, "c6i.24xlarge": 40000, "c6i.32xlarge": 40000,
    "i3.large": 3000, "i3.xlarge": 6000, "i3.2xlarge": 12000,
    "i3.4xlarge": 16000, "i3.8xlarge": 32500, "i3.16xlarge": 65000,
    "i3en.large": 4750, "i3en.xlarge": 9500, "i3en.2xlarge": 19000,
    "i3en.3xlarge": 26125, "i3en.6xlarge": 52250, "i3en.12xlarge": 65000,
    "i3en.24xlarge": 65000,
}
# Per-DB-instance-class IOPS ceiling for standard (non-Aurora) RDS: the maximum
# IOPS the instance class can push to its EBS-backed storage, independent of
# the provisioned storage IOPS.
# NOTE(review): values presumably derived from the RDS DB instance class /
# EBS-optimized documentation -- verify exact figures against current AWS docs.
# Classes missing from this table make audit_rds_standard() fall back to the
# storage-provisioned ceiling alone.
RDS_IOPS_CEILING = {
    "db.t3.micro": 1536, "db.t3.small": 1536, "db.t3.medium": 1536,
    "db.t3.large": 2048, "db.t3.xlarge": 2048, "db.t3.2xlarge": 2048,
    "db.t4g.micro": 1700, "db.t4g.small": 1700, "db.t4g.medium": 1700,
    "db.t4g.large": 2000, "db.t4g.xlarge": 2000, "db.t4g.2xlarge": 2000,
    "db.m5.large": 3600, "db.m5.xlarge": 6000, "db.m5.2xlarge": 8333,
    "db.m5.4xlarge": 16667, "db.m5.8xlarge": 18750, "db.m5.12xlarge": 28750,
    "db.m5.16xlarge": 37500, "db.m5.24xlarge": 40000,
    "db.m6g.large": 3500, "db.m6g.xlarge": 7000, "db.m6g.2xlarge": 10000,
    "db.m6g.4xlarge": 20000, "db.m6g.8xlarge": 30000, "db.m6g.12xlarge": 40000,
    "db.m6g.16xlarge": 40000,
    "db.m6i.large": 6667, "db.m6i.xlarge": 10000, "db.m6i.2xlarge": 13333,
    "db.m6i.4xlarge": 20000, "db.m6i.8xlarge": 26667, "db.m6i.12xlarge": 40000,
    "db.m6i.16xlarge": 40000,
    "db.r5.large": 3600, "db.r5.xlarge": 6000, "db.r5.2xlarge": 8333,
    "db.r5.4xlarge": 16667, "db.r5.8xlarge": 18750, "db.r5.12xlarge": 28750,
    "db.r5.16xlarge": 37500, "db.r5.24xlarge": 40000,
    "db.r6g.large": 3500, "db.r6g.xlarge": 7000, "db.r6g.2xlarge": 10000,
    "db.r6g.4xlarge": 20000, "db.r6g.8xlarge": 30000, "db.r6g.12xlarge": 40000,
    "db.r6g.16xlarge": 40000,
    "db.r6i.large": 6667, "db.r6i.xlarge": 10000, "db.r6i.2xlarge": 13333,
    "db.r6i.4xlarge": 20000, "db.r6i.8xlarge": 26667, "db.r6i.12xlarge": 40000,
    "db.r6i.16xlarge": 40000,
    "db.x1e.xlarge": 3700, "db.x1e.2xlarge": 7400, "db.x1e.4xlarge": 14800,
    "db.x1e.8xlarge": 29600, "db.x1e.16xlarge": 40000, "db.x1e.32xlarge": 40000,
    "db.x2g.large": 3500, "db.x2g.xlarge": 7000, "db.x2g.2xlarge": 10000,
    "db.x2g.4xlarge": 20000, "db.x2g.8xlarge": 30000, "db.x2g.12xlarge": 40000,
    "db.x2g.16xlarge": 40000,
}
# CloudWatch query period in seconds; count metrics (EBS Volume*Ops sums) are
# divided by this to convert per-period counts into an IOPS rate.
PERIOD_SECONDS = 60

@dataclass
class SaturationBreach:
    """A single sustained IOPS-saturation window observed on one resource."""
    account_id: str
    account_name: str
    region: str
    service_type: str        # e.g. "EBS (gp3)" or "RDS (postgres)"
    resource_id: str
    resource_name: str       # Name tag when present, otherwise the resource id
    instance_type: str
    iops_ceiling: int  # effective ceiling = min(storage, instance) for EBS/RDS
    storage_iops_ceiling: int  # provisioned storage IOPS
    instance_iops_ceiling: int  # instance class IOPS ceiling (0 = not applicable)
    threshold_pct: float
    threshold_iops: float    # iops_ceiling * threshold_pct / 100
    max_observed_iops: float
    max_observed_pct: float  # peak observed IOPS as % of the effective ceiling
    longest_breach_seconds: int
    breach_start_utc: str
    breach_end_utc: str
    metric_used: str         # which CloudWatch metrics/statistic produced the series
    note: str = ""           # human-readable explanation of which ceiling binds
def get_metric_datapoints(cw_client, namespace, metric_name, dimensions, start_time, end_time, stat="Sum"):
    """
    Fetch one CloudWatch metric series at PERIOD_SECONDS granularity.

    Returns a chronologically sorted list of (timestamp, value) tuples, where
    value is the requested statistic ("Sum" by default, "Average" for rate
    metrics such as RDS ReadIOPS/WriteIOPS).
    """
    response = cw_client.get_metric_statistics(
        Namespace=namespace,
        MetricName=metric_name,
        Dimensions=dimensions,
        StartTime=start_time,
        EndTime=end_time,
        Period=PERIOD_SECONDS,
        Statistics=[stat],
    )
    datapoints = response.get("Datapoints", [])
    # CloudWatch returns datapoints unordered; sort by timestamp before use.
    return sorted(((dp["Timestamp"], dp[stat]) for dp in datapoints), key=lambda pair: pair[0])
def find_sustained_breaches(combined_iops, threshold_iops, max_ops_duration_seconds,
                            is_count_metric=False, period_seconds=60):
    """
    Identify sustained runs of datapoints at or above threshold_iops.

    Args:
        combined_iops: dict mapping datetime -> metric value (read + write,
            summed per timestamp).
        threshold_iops: IOPS level at or above which a datapoint counts as
            breaching.
        max_ops_duration_seconds: minimum run length in seconds for a run to
            be reported.
        is_count_metric: if True, values are per-period counts (EBS
            Volume*Ops sums) and are divided by period_seconds to get IOPS;
            if False, values are already IOPS rates (RDS Read/WriteIOPS).
        period_seconds: seconds covered by each datapoint. Defaults to 60,
            matching the module-level PERIOD_SECONDS; parameterized so the
            function works with any CloudWatch period.

    Returns:
        List of (run_start, run_end, run_max_iops, duration_seconds) tuples.
    """
    if not combined_iops:
        return []
    breaches = []
    run_start = None
    run_end = None
    run_max = 0.0
    for ts in sorted(combined_iops):
        raw = combined_iops[ts]
        # Convert count metrics to a rate; rate metrics are used as-is.
        iops = raw / period_seconds if is_count_metric else raw
        if iops >= threshold_iops:
            if run_start is None:
                run_start = ts
            run_end = ts
            run_max = max(run_max, iops)
        else:
            if run_start is not None:
                # Each datapoint represents one full period, so add one period
                # to the start->end span to get the total covered duration.
                duration = (run_end - run_start).total_seconds() + period_seconds
                if duration >= max_ops_duration_seconds:
                    breaches.append((run_start, run_end, run_max, duration))
            run_start = None
            run_end = None
            run_max = 0.0
    # Flush a run still open at the end of the series.
    if run_start is not None:
        duration = (run_end - run_start).total_seconds() + period_seconds
        if duration >= max_ops_duration_seconds:
            breaches.append((run_start, run_end, run_max, duration))
    return breaches
def build_volume_instance_map(ec2_client):
    """
    Map every in-use EBS volume to its attached EC2 instance.

    Returns a dict of volume_id -> (instance_id, instance_type), built from
    two paginated describe calls: instances first (for the type lookup), then
    in-use volumes (for the attachment). The first attachment whose instance
    is known wins.
    """
    type_by_instance = {}
    for page in ec2_client.get_paginator("describe_instances").paginate():
        for reservation in page["Reservations"]:
            for instance in reservation["Instances"]:
                type_by_instance[instance["InstanceId"]] = instance.get("InstanceType", "unknown")

    mapping = {}
    volume_pages = ec2_client.get_paginator("describe_volumes").paginate(
        Filters=[{"Name": "status", "Values": ["in-use"]}]
    )
    for page in volume_pages:
        for volume in page["Volumes"]:
            for attachment in volume.get("Attachments", []):
                attached_to = attachment.get("InstanceId")
                if attached_to and attached_to in type_by_instance:
                    mapping[volume["VolumeId"]] = (attached_to, type_by_instance[attached_to])
                    break
    return mapping
def audit_ebs(session, account_id, account_name, region, max_ops_pct, max_ops_duration_seconds, lookback_hours):
    """
    Scan in-use io1/io2/gp3 EBS volumes in one region for sustained IOPS saturation.

    For each volume the effective ceiling is min(provisioned storage IOPS,
    attached instance type's EBS IOPS ceiling). VolumeReadOps + VolumeWriteOps
    (Sum per 60s, converted to an IOPS rate) are then compared against
    max_ops_pct% of that ceiling over the lookback window.

    Returns a list of SaturationBreach, one per sustained breach window.
    """
    findings = []
    ec2 = session.client("ec2", region_name=region)
    cw = session.client("cloudwatch", region_name=region)
    end_time = datetime.now(timezone.utc)
    start_time = end_time - timedelta(hours=lookback_hours)
    # Build volume -> instance type mapping once for the region
    try:
        vol_instance_map = build_volume_instance_map(ec2)
    except Exception as e:
        # Best-effort: without the map we still scan, using storage ceilings only.
        log.warning(f"Could not build volume/instance map in {account_id}/{region}: {e}")
        vol_instance_map = {}
    vol_paginator = ec2.get_paginator("describe_volumes")
    # Only volume types with user-provisioned IOPS are scanned (gp2/st1/sc1 excluded).
    for page in vol_paginator.paginate(
        Filters=[
            {"Name": "volume-type", "Values": ["io1", "io2", "gp3"]},
            {"Name": "status", "Values": ["in-use"]},
        ]
    ):
        for vol in page["Volumes"]:
            provisioned_iops = vol.get("Iops", 0) or 0
            if provisioned_iops == 0:
                continue  # no provisioned IOPS figure to measure against
            vol_id = vol["VolumeId"]
            tags = vol.get("Tags", [])
            name = next((t["Value"] for t in tags if t["Key"] == "Name"), vol_id)
            vol_type = vol.get("VolumeType", "unknown")
            # Determine effective ceiling: min(storage IOPS, instance IOPS ceiling)
            instance_type = "unknown"
            instance_iops_ceiling = 0
            ceiling_note = ""
            if vol_id in vol_instance_map:
                _, instance_type = vol_instance_map[vol_id]
                instance_iops_ceiling = EC2_IOPS_CEILING.get(instance_type, 0)
            if instance_iops_ceiling > 0:
                effective_ceiling = min(provisioned_iops, instance_iops_ceiling)
                # Record which of the two ceilings is the binding constraint.
                binding = "storage" if provisioned_iops <= instance_iops_ceiling else "instance"
                ceiling_note = (
                    f"Effective ceiling = min(storage: {provisioned_iops:,}, "
                    f"instance {instance_type}: {instance_iops_ceiling:,}) = {effective_ceiling:,} IOPS "
                    f"[{binding} ceiling is binding]"
                )
            else:
                # Unknown/unattached instance type: fall back to the storage ceiling alone.
                effective_ceiling = provisioned_iops
                ceiling_note = (
                    f"Storage ceiling used ({provisioned_iops:,} IOPS); "
                    f"instance type {instance_type!r} not in lookup table"
                )
            threshold_iops = effective_ceiling * (max_ops_pct / 100.0)
            try:
                pts_read = get_metric_datapoints(cw, "AWS/EBS", "VolumeReadOps",
                    [{"Name": "VolumeId", "Value": vol_id}], start_time, end_time)
                pts_write = get_metric_datapoints(cw, "AWS/EBS", "VolumeWriteOps",
                    [{"Name": "VolumeId", "Value": vol_id}], start_time, end_time)
                # Merge read + write series into one combined value per timestamp.
                combined = {}
                for ts, val in pts_read:
                    combined[ts] = combined.get(ts, 0.0) + val
                for ts, val in pts_write:
                    combined[ts] = combined.get(ts, 0.0) + val
                # Volume*Ops are per-period counts, hence is_count_metric=True
                # (the helper divides by the 60s period to get IOPS).
                breaches = find_sustained_breaches(combined, threshold_iops, max_ops_duration_seconds, is_count_metric=True)
            except Exception as e:
                log.warning(f"CloudWatch error for EBS {vol_id} in {account_id}/{region}: {e}")
                continue
            for breach_start, breach_end, breach_max_iops, breach_secs in breaches:
                findings.append(SaturationBreach(
                    account_id=account_id, account_name=account_name, region=region,
                    service_type=f"EBS ({vol_type})", resource_id=vol_id, resource_name=name,
                    instance_type=instance_type,
                    iops_ceiling=effective_ceiling,
                    storage_iops_ceiling=provisioned_iops,
                    instance_iops_ceiling=instance_iops_ceiling,
                    threshold_pct=max_ops_pct, threshold_iops=round(threshold_iops, 1),
                    max_observed_iops=round(breach_max_iops, 1),
                    max_observed_pct=round((breach_max_iops / effective_ceiling) * 100, 1),
                    longest_breach_seconds=int(breach_secs),
                    breach_start_utc=breach_start.strftime("%Y-%m-%d %H:%M:%S UTC"),
                    breach_end_utc=breach_end.strftime("%Y-%m-%d %H:%M:%S UTC"),
                    metric_used="AWS/EBS: VolumeReadOps + VolumeWriteOps (Sum / 60s = IOPS)",
                    note=ceiling_note,
                ))
    return findings
def audit_rds_standard(session, account_id, account_name, region, max_ops_pct, max_ops_duration_seconds, lookback_hours):
    """
    Scan standard (non-Aurora) RDS instances with provisioned IOPS in one region.

    The effective ceiling is min(provisioned storage IOPS, instance class IOPS
    ceiling). ReadIOPS + WriteIOPS (Average -- already rate metrics) are
    compared against max_ops_pct% of that ceiling over the lookback window.

    Returns a list of SaturationBreach, one per sustained breach window.
    """
    findings = []
    rds = session.client("rds", region_name=region)
    cw = session.client("cloudwatch", region_name=region)
    end_time = datetime.now(timezone.utc)
    start_time = end_time - timedelta(hours=lookback_hours)
    paginator = rds.get_paginator("describe_db_instances")
    for page in paginator.paginate():
        for db in page["DBInstances"]:
            engine = db.get("Engine", "")
            # Aurora cluster storage has no instance-level IOPS ceiling -- see module docstring.
            if "aurora" in engine.lower():
                continue
            provisioned_iops = db.get("Iops", 0) or 0
            if provisioned_iops == 0:
                continue  # no provisioned IOPS figure to measure against
            status = db.get("DBInstanceStatus", "")
            # Only audit instances in states where metrics are meaningful.
            if status not in ("available", "backing-up", "modifying"):
                continue
            db_id = db.get("DBInstanceIdentifier", "")
            instance_type = db.get("DBInstanceClass", "unknown")
            tags = db.get("TagList", [])
            name = next((t["Value"] for t in tags if t["Key"] == "Name"), db_id)
            # Determine effective ceiling: min(storage IOPS, instance class IOPS ceiling)
            instance_iops_ceiling = RDS_IOPS_CEILING.get(instance_type, 0)
            if instance_iops_ceiling > 0:
                effective_ceiling = min(provisioned_iops, instance_iops_ceiling)
                # Record which of the two ceilings is the binding constraint.
                binding = "storage" if provisioned_iops <= instance_iops_ceiling else "instance"
                ceiling_note = (
                    f"Effective ceiling = min(storage: {provisioned_iops:,}, "
                    f"instance {instance_type}: {instance_iops_ceiling:,}) = {effective_ceiling:,} IOPS "
                    f"[{binding} ceiling is binding]"
                )
            else:
                # Unknown instance class: fall back to the storage ceiling alone.
                effective_ceiling = provisioned_iops
                ceiling_note = (
                    f"Storage ceiling used ({provisioned_iops:,} IOPS); "
                    f"instance type {instance_type!r} not in lookup table"
                )
            threshold_iops = effective_ceiling * (max_ops_pct / 100.0)
            dims = [{"Name": "DBInstanceIdentifier", "Value": db_id}]
            try:
                # ReadIOPS/WriteIOPS are already rates, so Average is used directly
                # (no count-to-rate conversion needed).
                pts_read = get_metric_datapoints(cw, "AWS/RDS", "ReadIOPS", dims, start_time, end_time, stat="Average")
                pts_write = get_metric_datapoints(cw, "AWS/RDS", "WriteIOPS", dims, start_time, end_time, stat="Average")
                combined = {}
                for ts, val in pts_read:
                    combined[ts] = combined.get(ts, 0.0) + val
                for ts, val in pts_write:
                    combined[ts] = combined.get(ts, 0.0) + val
                breaches = find_sustained_breaches(combined, threshold_iops, max_ops_duration_seconds, is_count_metric=False)
            except Exception as e:
                log.warning(f"CloudWatch error for RDS {db_id} in {account_id}/{region}: {e}")
                continue
            for breach_start, breach_end, breach_max_iops, breach_secs in breaches:
                findings.append(SaturationBreach(
                    account_id=account_id, account_name=account_name, region=region,
                    service_type=f"RDS ({engine})", resource_id=db_id, resource_name=name,
                    instance_type=instance_type,
                    iops_ceiling=effective_ceiling,
                    storage_iops_ceiling=provisioned_iops,
                    instance_iops_ceiling=instance_iops_ceiling,
                    threshold_pct=max_ops_pct, threshold_iops=round(threshold_iops, 1),
                    max_observed_iops=round(breach_max_iops, 1),
                    max_observed_pct=round((breach_max_iops / effective_ceiling) * 100, 1),
                    longest_breach_seconds=int(breach_secs),
                    breach_start_utc=breach_start.strftime("%Y-%m-%d %H:%M:%S UTC"),
                    breach_end_utc=breach_end.strftime("%Y-%m-%d %H:%M:%S UTC"),
                    metric_used="AWS/RDS: ReadIOPS + WriteIOPS (Average)",
                    note=ceiling_note,
                ))
    return findings
def audit_aurora(session, account_id, account_name, region, max_ops_pct, max_ops_duration_seconds, lookback_hours):
    """
    Intentional no-op: Aurora is not audited by this script.

    Aurora keeps persistent data (tables/indexes) on its distributed cluster
    storage, which is what the ReadIOPS/WriteIOPS CloudWatch metrics measure.
    That storage auto-scales and has no instance-level IOPS ceiling
    (SupportsIops=false, SupportsStorageThroughput=false, StorageType="aurora"),
    so there is no fixed limit to compare utilisation against. The same applies
    to Aurora Serverless V2, which shares the distributed storage subsystem and
    scales capacity via ACUs.

    Only Aurora's LOCAL storage (temp files for sorts, temp tables, index
    builds) is bounded by the instance class's EBS limits. Detecting pressure
    there requires the Performance Insights API (IO:BufFileRead /
    IO:BufFileWrite wait events) rather than CloudWatch, and the usual fix is
    query tuning (e.g. work_mem) rather than instance scaling -- hence it is
    out of scope for this script.

    References:
    - https://aws.amazon.com/rds/aurora/instance-types/
    - https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/aurora-serverless-v2.html
    - https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/AuroraPostgreSQL.Managing.html#AuroraPostgreSQL.Managing.TempStorage

    Always returns an empty findings list; all parameters are accepted only to
    keep the auditor signature uniform with audit_ebs/audit_rds_standard.
    """
    log.info(f" {account_id}/{region}: Aurora cluster storage monitoring skipped (no instance-level IOPS ceiling)")
    log.info(f" {account_id}/{region}: Aurora local storage (temp files) would require Performance Insights API monitoring")
    return []
def list_accounts_in_ou(ou_id):
    """
    Recursively collect all ACTIVE accounts under an Organizations OU.

    Walks the OU tree depth-first via list_children (accounts before child
    OUs at each level), describing each account to obtain its name and
    status. Returns a list of {"id": ..., "name": ...} dicts.
    """
    org = boto3.client("organizations")
    active_accounts = []

    def walk(parent_id):
        for child_type in ("ACCOUNT", "ORGANIZATIONAL_UNIT"):
            pages = org.get_paginator("list_children").paginate(ParentId=parent_id, ChildType=child_type)
            for page in pages:
                for child in page["Children"]:
                    if child_type != "ACCOUNT":
                        walk(child["Id"])
                        continue
                    try:
                        account = org.describe_account(AccountId=child["Id"])["Account"]
                        if account["Status"] == "ACTIVE":
                            active_accounts.append({"id": account["Id"], "name": account["Name"]})
                    except Exception as e:
                        # Skip accounts we cannot describe (permissions, suspension).
                        log.warning(f"Could not describe account {child['Id']}: {e}")

    walk(ou_id)
    return active_accounts
def get_session(account_id, role_name):
    """
    Return a boto3 Session for the target account.

    With a role_name, assumes arn:aws:iam::<account_id>:role/<role_name> via
    STS and builds a session from the temporary credentials. With a falsy
    role_name, the default session (current credentials) is returned.
    """
    if not role_name:
        return boto3.Session()
    sts = boto3.client("sts")
    assumed = sts.assume_role(
        RoleArn=f"arn:aws:iam::{account_id}:role/{role_name}",
        RoleSessionName="IOPSSaturationScan",
    )["Credentials"]
    return boto3.Session(
        aws_access_key_id=assumed["AccessKeyId"],
        aws_secret_access_key=assumed["SecretAccessKey"],
        aws_session_token=assumed["SessionToken"],
    )
def audit_account(account, role_name, regions, max_ops_pct, max_ops_duration_seconds, lookback_hours):
    """
    Run every per-region audit (EBS, standard RDS, Aurora no-op) for one account.

    A role-assumption failure aborts the whole account (returns []); a failure
    inside one region is logged and the remaining regions still run. Returns
    the combined list of SaturationBreach findings.
    """
    account_id = account["id"]
    account_name = account["name"]
    log.info(f"Auditing account {account_id} ({account_name})")
    try:
        session = get_session(account_id, role_name)
    except Exception as e:
        log.error(f"Cannot assume role in {account_id}: {e}")
        return []
    collected = []
    auditors = (audit_ebs, audit_rds_standard, audit_aurora)
    for region in regions:
        log.info(f" {account_id} scanning {region}...")
        try:
            # One try per region: a failure in an earlier auditor skips the
            # later ones for that region, matching sequential execution.
            for auditor in auditors:
                collected.extend(auditor(session, account_id, account_name, region, max_ops_pct, max_ops_duration_seconds, lookback_hours))
        except Exception as e:
            log.error(f" Error in {account_id}/{region}: {e}")
    return collected
def write_csv(findings, path):
    """
    Write the findings to a CSV file at *path*.

    Column order follows the SaturationBreach field order. No file is created
    when there are no findings.
    """
    if not findings:
        return
    rows = [asdict(finding) for finding in findings]
    with open(path, "w", newline="") as out:
        writer = csv.DictWriter(out, fieldnames=list(rows[0].keys()))
        writer.writeheader()
        writer.writerows(rows)
    log.info(f"CSV written: {path}")
def write_excel(findings, path):
    """
    Write findings to an Excel workbook with severity colour-coding.

    Sheet "IOPS Saturation Breaches": one row per breach, sorted worst-first
    by max_observed_pct, with each row filled red/orange/amber/green by
    severity. Sheet "Summary by Service": breach counts and aggregates per
    service type. Skipped (with a warning) when pandas/openpyxl are absent,
    and silently when there are no findings.
    """
    if not PANDAS_AVAILABLE:
        log.warning("pandas/openpyxl not installed -- skipping Excel output. pip install pandas openpyxl")
        return
    if not findings:
        return
    # Imported lazily so the module loads even without openpyxl installed.
    from openpyxl.styles import PatternFill
    rows = [asdict(f) for f in findings]
    df = pd.DataFrame(rows)
    df = df.sort_values("max_observed_pct", ascending=False)
    def row_colour(pct):
        # Severity palette: red >=100%, orange >=95%, amber >=90%, else green.
        if pct >= 100:
            return "FF2222"
        elif pct >= 95:
            return "FF6600"
        elif pct >= 90:
            return "FFB300"
        return "90EE90"
    with pd.ExcelWriter(path, engine="openpyxl") as writer:
        df.to_excel(writer, index=False, sheet_name="IOPS Saturation Breaches")
        ws = writer.sheets["IOPS Saturation Breaches"]
        # openpyxl cells are 1-based; data starts at row 2 (row 1 is the header).
        pct_col_idx = list(df.columns).index("max_observed_pct") + 1
        for row_idx in range(2, len(df) + 2):
            pct_val = ws.cell(row=row_idx, column=pct_col_idx).value or 0
            colour = row_colour(pct_val)
            for col_idx in range(1, len(df.columns) + 1):
                ws.cell(row=row_idx, column=col_idx).fill = PatternFill(
                    start_color=colour, end_color=colour, fill_type="solid"
                )
        summary = df.groupby("service_type").agg(
            breaches=("resource_id", "count"),
            max_pct_observed=("max_observed_pct", "max"),
            avg_breach_seconds=("longest_breach_seconds", "mean"),
        ).reset_index()
        summary.to_excel(writer, index=False, sheet_name="Summary by Service")
    log.info(f"Excel written: {path}")
def print_results(findings, max_ops_pct, max_ops_duration_seconds):
    """
    Print a human-readable breach report to stdout.

    Findings are grouped by service type; within each group they keep the
    worst-first ordering (descending peak utilisation percentage).
    """
    banner = "=" * 70
    print()
    print(banner)
    print("IOPS SATURATION BREACH REPORT")
    print(f"Threshold : >= {max_ops_pct}% of effective IOPS ceiling")
    print(f"Duration : >= {max_ops_duration_seconds}s sustained")
    print(banner)
    if not findings:
        print("\nNo sustained IOPS saturation breaches found.")
        print(banner)
        return
    grouped = {}
    for finding in sorted(findings, key=lambda f: f.max_observed_pct, reverse=True):
        grouped.setdefault(finding.service_type, []).append(finding)
    for svc_type, items in sorted(grouped.items()):
        plural = "es" if len(items) != 1 else ""
        print(f"\n {svc_type} ({len(items)} breach{plural})")
        print(f" {'Resource':<40} {'Ceiling':>8} {'Peak IOPS':>10} {'Peak %':>7} {'Duration':>10}")
        print(f" {'=' * 40} {'=' * 8} {'=' * 10} {'=' * 7} {'=' * 10}")
        for f in items:
            print(f" {f.resource_name:<40} {f.iops_ceiling:>8,} {f.max_observed_iops:>10,.0f} {f.max_observed_pct:>6.1f}% {f.longest_breach_seconds:>8}s")
            print(f" Account: {f.account_id} | Region: {f.region}")
            print(f" Window: {f.breach_start_utc} to {f.breach_end_utc}")
            if f.note:
                print(f" Note: {f.note}")
    print(f"\n Total breaches found: {len(findings)}")
    print(banner)
def parse_args():
    """
    Parse command-line arguments.

    Exactly one of --ou-id / --accounts is required; --max-ops-pct and
    --max-ops-duration-secs are mandatory, everything else has defaults.
    """
    parser = argparse.ArgumentParser(
        description="Scan EBS volumes and standard RDS instances for sustained IOPS saturation. "
                    "Aurora (including Serverless V2) is excluded (no instance-level IOPS ceiling for cluster storage)."
    )
    scope = parser.add_mutually_exclusive_group(required=True)
    scope.add_argument("--ou-id", help="AWS Organizations OU ID")
    scope.add_argument("--accounts", nargs="+", help="Specific AWS account IDs")
    parser.add_argument(
        "--max-ops-pct", type=float, required=True,
        help="Percentage of IOPS ceiling that constitutes a breach (e.g. 90)",
    )
    parser.add_argument(
        "--max-ops-duration-secs", type=int, required=True,
        help="Minimum sustained breach duration in seconds to report (e.g. 120)",
    )
    parser.add_argument(
        "--lookback-hours", type=int, default=24,
        help="Hours of CloudWatch history to examine (default: 24)",
    )
    parser.add_argument("--role-name", default="OrganizationAccountAccessRole")
    parser.add_argument("--regions", nargs="+", default=["af-south-1"])
    parser.add_argument("--workers", type=int, default=5)
    parser.add_argument("--output-prefix", default="iops_saturation_report")
    return parser.parse_args()
def main():
    """
    Entry point: fan account audits out across a thread pool, then report.

    Produces console, CSV and Excel reports named with a UTC timestamp.
    Returns exit code 1 when any resource reached 100% of its effective
    ceiling during a breach window (useful for CI/alerting), else 0.
    """
    args = parse_args()
    # Timezone-aware UTC timestamp -- datetime.utcnow() is deprecated (naive),
    # and the rest of this file already uses datetime.now(timezone.utc).
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    log.info("IOPS Saturation Scan starting")
    log.info(f" Threshold : >= {args.max_ops_pct}% for >= {args.max_ops_duration_secs}s")
    log.info(f" Lookback : {args.lookback_hours}h | Regions: {', '.join(args.regions)}")
    # Either enumerate the OU, or wrap the explicit account IDs (name == id).
    accounts = list_accounts_in_ou(args.ou_id) if args.ou_id else [{"id": a, "name": a} for a in args.accounts]
    if args.ou_id:
        log.info(f" Found {len(accounts)} active accounts in OU {args.ou_id}")
    all_findings = []
    # One worker per account; each account scans its regions sequentially.
    with ThreadPoolExecutor(max_workers=args.workers) as executor:
        futures = {
            executor.submit(audit_account, acc, args.role_name, args.regions,
                            args.max_ops_pct, args.max_ops_duration_secs, args.lookback_hours): acc
            for acc in accounts
        }
        for future in as_completed(futures):
            acc = futures[future]
            try:
                findings = future.result()
                all_findings.extend(findings)
                log.info(f"Account {acc['id']} complete: {len(findings)} breach(es)")
            except Exception as e:
                # One account failing must not sink the whole scan.
                log.error(f"Account {acc['id']} failed: {e}")
    print_results(all_findings, args.max_ops_pct, args.max_ops_duration_secs)
    write_csv(all_findings, f"{args.output_prefix}_{timestamp}.csv")
    write_excel(all_findings, f"{args.output_prefix}_{timestamp}.xlsx")
    log.info("Scan complete.")
    # Non-zero exit signals that full saturation (>=100%) was observed.
    return 1 if any(f.max_observed_pct >= 100.0 for f in all_findings) else 0
# Script entry point: propagate the scan's exit code to the shell
# (1 = at least one resource hit 100% of its effective ceiling).
if __name__ == "__main__":
    sys.exit(main())
PYEOF
chmod +x iops_saturation.py
echo "iops_saturation.py written — run it directly, dependencies install automatically on first use"