cycler: Add scripts to help break down large storage buckets
BUG=b:322535016
Change-Id: Ia0cfdd36c80f4af6ac10b131b1be028272d0c52b
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/infra/go/+/5352205
Reviewed-by: George Engelbrecht <engeg@google.com>
Commit-Queue: Madeleine Hardt <hardtmad@google.com>
Tested-by: Madeleine Hardt <hardtmad@google.com>
diff --git a/cmd/cycler/examples/policies/snapshot_up_to_37_days_old.rego b/cmd/cycler/examples/policies/snapshot_up_to_37_days_old.rego
new file mode 100644
index 0000000..a806e67
--- /dev/null
+++ b/cmd/cycler/examples/policies/snapshot_up_to_37_days_old.rego
@@ -0,0 +1,42 @@
+# The cycler executable will always load the package data.cycler
+package cycler
+
+# The global input should be a google storage ObjectAttrs struct.
+# ObjectAttrs represents the metadata for a Google Cloud Storage (GCS) object.
+# type ObjectAttrs struct {
+#     Bucket                  string
+#     Name                    string
+#     ContentType             string
+#     ContentLanguage         string
+#     CacheControl            string
+#     EventBasedHold          bool
+#     TemporaryHold           bool
+#     RetentionExpirationTime time.Time
+#     ACL                     []ACLRule
+#     PredefinedACL           string
+#     Owner                   string
+#     Size                    int64
+#     ContentEncoding         string
+#     ContentDisposition      string
+#     MD5                     []byte
+#     CRC32C                  uint32
+#     MediaLink               string
+#     Metadata                map[string]string
+#     Generation              int64
+#     Metageneration          int64
+#     StorageClass            string
+#     Created                 time.Time
+#     Deleted                 time.Time
+#     Updated                 time.Time
+#     CustomerKeySHA256       string
+#     KMSKeyName              string
+#     Prefix                  string
+#     Etag                    string
+# }
+
+# "ageDays" is calculated on the fly by cycler and inserted into input,
+# which is why it isn't prefixed with .attr. like the rest of the fields.
+# act is true for objects under a "*-snapshot/" prefix that are < 38 days old.
+act := true {
+    re_match(".+?-snapshot/.*$", input.attr.Name)
+    input.ageDays < 38
+}
diff --git a/cmd/cycler/examples/snapshot_up_to_37_days_old.json b/cmd/cycler/examples/snapshot_up_to_37_days_old.json
new file mode 100644
index 0000000..7291bbe
--- /dev/null
+++ b/cmd/cycler/examples/snapshot_up_to_37_days_old.json
@@ -0,0 +1,34 @@
+{
+ "run_log_configuration": {
+ "destination_url": "gs://chromeos-throw-away-bucket/cycler-logs",
+ "chunk_size_bytes": 104857600,
+ "channel_size": 10000,
+ "persist_retries": 100,
+ "max_unpersisted_logs": 10
+ },
+
+ "policy_effect_configuration": {
+ "noop": { },
+ "policy_document_path": "examples/policies/snapshot_up_to_37_days_old.rego"
+ },
+
+ "stats_configuration": {
+ "prefix_report_max_depth": 2,
+ "age_days_histogram_options": {
+ "num_buckets": 16,
+ "growth_factor": 1.0,
+ "base_bucket_size": 1.0,
+ "min_value": 0
+ },
+ "size_bytes_histogram_options": {
+ "num_buckets": 16,
+ "growth_factor": 4.0,
+ "base_bucket_size": 1.0,
+ "min_value": 0
+ }
+ },
+
+ "mutation_allowed" : false,
+
+ "bucket": "chromeos-image-archive"
+}
diff --git a/cmd/cycler/tools/iterate_cycler.py b/cmd/cycler/tools/iterate_cycler.py
new file mode 100755
index 0000000..84e940c
--- /dev/null
+++ b/cmd/cycler/tools/iterate_cycler.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python3
+
+# Copyright 2024 The ChromiumOS Authors
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""A small script to use cycler to iterate through directories.
+
+Some of directories in chromeos-bot and chromeos-int bot are very large. It can
+help to list the subdirs and use this script to iterate through some of them.
+
+This script is not productionized. Please use with caution.
+Pairs well with upload_cycler_results.py
+"""
+
+# Ensure you're authed with gcloud auth application-default login.
+
+import subprocess
+
+BUCKET = 'YOUR-BUCKET'
+RUN_CONFIG_PATH = 'examples/YOUR-POLICY'
+DIR_LISTING_PATH = 'YOUR-SUBDIRS'
+
+with open(DIR_LISTING_PATH, 'r', encoding='utf-8') as f:
+ gs_dirs = f.readlines()
+
+for gs_dir in gs_dirs:
+ print(f'Evaluating: {gs_dir}')
+ gs_dir_name = gs_dir.split('/')[-2]
+
+ cycler_cmd = [
+ './cycler',
+ '-bucket',
+ BUCKET,
+ '-iUnderstandCyclerIsInEarlyDevelopment',
+ '-runConfigPath',
+ RUN_CONFIG_PATH,
+ '-prefixRoot',
+ gs_dir_name,
+ '-jsonOutFile',
+ f'cycler_results/{gs_dir_name}.json',
+ ]
+ cmd_str = ' '.join(cycler_cmd)
+ print(f'Running {cmd_str}')
+ result = subprocess.run(cycler_cmd,
+ check=True,
+ capture_output=True,
+ text=True)
+ for l in result.stderr.splitlines():
+ print(l)
diff --git a/cmd/cycler/tools/upload_cycler_results.py b/cmd/cycler/tools/upload_cycler_results.py
new file mode 100755
index 0000000..eae5c12
--- /dev/null
+++ b/cmd/cycler/tools/upload_cycler_results.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+
+# Copyright 2024 The ChromiumOS Authors
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""A small script to upload cycler results to BQ.
+
+Takes JSON results from cycler, formats according to what BQ requires, and
+uploads to specified BQ table. Note that this script uses the particular
+schema for chromeos-bot:chromeos_bot_storage.storage.
+
+This script is not productionized. Please use with caution.
+Pairs well with iterate_cycler.py
+"""
+
+import datetime
+import os
+import json
+import subprocess
+
+BQ_TABLE_ID = 'YOUR-BQ-TABLE'
+RESULTS_DIR_PATH= 'YOUR-RESULTS-DIR'
+FORMATTED_RESULTS_DIR_PATH= 'YOUR-FORMATTED-RESULTS-DIR'
+GS_BUCKET= 'YOUR-GS-BUCKET' # Used in table so we can aggregate across buckets.
+
+for filename in os.listdir(RESULTS_DIR_PATH):
+ # Read and format results to be JSON KVs with newline delimiters per BQ.
+ file_path = os.path.join(RESULTS_DIR_PATH, filename)
+ formatted = []
+ with open(file_path, 'r', encoding='utf-8') as f:
+ data = json.load(f)
+ print(f"Formatting and uploading {file_path}")
+ for k,v in data["PrefixStats"]["PrefixMapSizeBytes"].items():
+ target = k.split('/')[0]
+ # Skip aggregated prefixes.
+ if target != k:
+ formatted.append({
+ "target": target,
+ "prefix": k,
+ "bytes": v,
+ # Include upload date for grouping periodic storage snapshots.
+ "date": datetime.datetime.today().strftime('%Y-%m-%d'),
+ "bucket": GS_BUCKET
+ })
+
+ # Ensure we have results and aren't trying to upload data for an empty dir.
+ if formatted:
+ formatted_file_path= os.path.join(FORMATTED_RESULTS_DIR_PATH, filename)
+ print(f"Writing out to {formatted_file_path}")
+ with open(formatted_file_path, 'w', encoding='utf-8') as f:
+ f.write("\n".join(json.dumps(obj) for obj in formatted))
+
+ # Upload row to BQ
+ cmd = [
+ 'bq',
+ 'load',
+ '--autodetect',
+ '--source_format=NEWLINE_DELIMITED_JSON',
+ BQ_TABLE_ID,
+ formatted_file_path
+ ]
+ cmd_str = ' '.join(cycler_cmd)
+ print(f'Running {cmd_str}')
+ result = subprocess.run(cmd, check=True, capture_output=True, text=True)
+ print(result.stdout)