cycler: Add scripts to help break down large storage buckets

BUG=b:322535016

Change-Id: Ia0cfdd36c80f4af6ac10b131b1be028272d0c52b
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/infra/go/+/5352205
Reviewed-by: George Engelbrecht <engeg@google.com>
Commit-Queue: Madeleine Hardt <hardtmad@google.com>
Tested-by: Madeleine Hardt <hardtmad@google.com>
diff --git a/cmd/cycler/examples/policies/snapshot_up_to_37_days_old.rego b/cmd/cycler/examples/policies/snapshot_up_to_37_days_old.rego
new file mode 100644
index 0000000..a806e67
--- /dev/null
+++ b/cmd/cycler/examples/policies/snapshot_up_to_37_days_old.rego
@@ -0,0 +1,42 @@
+# The cycler executable will always load the package data.cycler
+package cycler
+
+# The global input should be a google storage ObjectAttrs struct.
+# ObjectAttrs represents the metadata for a Google Cloud Storage (GCS) object.
+# type ObjectAttrs struct {
+#   Bucket string
+#   Name string
+#   ContentType string
+#   ContentLanguage string
+#   CacheControl string
+#   EventBasedHold bool
+#   TemporaryHold bool
+#   RetentionExpirationTime time.Time
+#   ACL []ACLRule
+#   PredefinedACL string
+#   Owner string
+#   Size int64
+#   ContentEncoding string
+#   ContentDisposition string
+#   MD5 []byte
+#   CRC32C uint32
+#   MediaLink string
+#   Metadata map[string]string
+#   Generation int64
+#   Metageneration int64
+#   StorageClass string
+#   Created time.Time
+#   Deleted time.Time
+#   Updated time.Time
+#   CustomerKeySHA256 string
+#   KMSKeyName string
+#   Prefix string
+#   Etag string
+# }
+
+# "ageDays" is calculated on the fly by cycler and inserted into input,
+# which is why it isn't prefixed with .attr. like the rest of the fields.
+act := true {  # the configured effect applies only when all body conditions hold
+    re_match(".+?-snapshot/.*$", input.attr.Name)  # object lives under a "*-snapshot/" prefix
+    input.ageDays < 38  # i.e. up to 37 days old, matching the policy filename
+}
diff --git a/cmd/cycler/examples/snapshot_up_to_37_days_old.json b/cmd/cycler/examples/snapshot_up_to_37_days_old.json
new file mode 100644
index 0000000..7291bbe
--- /dev/null
+++ b/cmd/cycler/examples/snapshot_up_to_37_days_old.json
@@ -0,0 +1,34 @@
+{
+    "run_log_configuration": {
+        "destination_url": "gs://chromeos-throw-away-bucket/cycler-logs",
+        "chunk_size_bytes": 104857600,
+        "channel_size": 10000,
+        "persist_retries": 100,
+        "max_unpersisted_logs": 10
+    },
+
+    "policy_effect_configuration": {
+        "noop": { },
+        "policy_document_path": "examples/policies/snapshot_up_to_37_days_old.rego"
+    },
+
+    "stats_configuration": {
+        "prefix_report_max_depth": 2,
+        "age_days_histogram_options": {
+            "num_buckets": 16,
+            "growth_factor": 1.0,
+            "base_bucket_size": 1.0,
+            "min_value": 0
+        },
+        "size_bytes_histogram_options": {
+            "num_buckets": 16,
+            "growth_factor": 4.0,
+            "base_bucket_size": 1.0,
+            "min_value": 0
+        }
+    },
+
+    "mutation_allowed" : false,
+
+    "bucket": "chromeos-image-archive"
+}
diff --git a/cmd/cycler/tools/iterate_cycler.py b/cmd/cycler/tools/iterate_cycler.py
new file mode 100755
index 0000000..84e940c
--- /dev/null
+++ b/cmd/cycler/tools/iterate_cycler.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python3
+
+# Copyright 2024 The ChromiumOS Authors
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""A small script to use cycler to iterate through directories.
+
+Some of directories in chromeos-bot and chromeos-int bot are very large. It can
+help to list the subdirs and use this script to iterate through some of them.
+
+This script is not productionized. Please use with caution.
+Pairs well with upload_cycler_results.py
+"""
+
+# Ensure you're authed with gcloud auth application-default login.
+
+import subprocess
+
+BUCKET = 'YOUR-BUCKET'
+RUN_CONFIG_PATH = 'examples/YOUR-POLICY'
+DIR_LISTING_PATH = 'YOUR-SUBDIRS'
+
+with open(DIR_LISTING_PATH, 'r', encoding='utf-8') as f:
+    gs_dirs = f.readlines()
+
+for gs_dir in gs_dirs:
+    print(f'Evaluating: {gs_dir}')
+    gs_dir_name = gs_dir.split('/')[-2]
+
+    cycler_cmd = [
+        './cycler',
+        '-bucket',
+        BUCKET,
+        '-iUnderstandCyclerIsInEarlyDevelopment',
+        '-runConfigPath',
+        RUN_CONFIG_PATH,
+        '-prefixRoot',
+        gs_dir_name,
+        '-jsonOutFile',
+        f'cycler_results/{gs_dir_name}.json',
+    ]
+    cmd_str = ' '.join(cycler_cmd)
+    print(f'Running {cmd_str}')
+    result = subprocess.run(cycler_cmd,
+                            check=True,
+                            capture_output=True,
+                            text=True)
+    for l in result.stderr.splitlines():
+        print(l)
diff --git a/cmd/cycler/tools/upload_cycler_results.py b/cmd/cycler/tools/upload_cycler_results.py
new file mode 100755
index 0000000..eae5c12
--- /dev/null
+++ b/cmd/cycler/tools/upload_cycler_results.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+
+# Copyright 2024 The ChromiumOS Authors
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""A small script to upload cycler results to BQ.
+
+Takes JSON results from cycler, formats according to what BQ requires, and
+uploads to specified BQ table. Note that this script uses the particular
+schema for chromeos-bot:chromeos_bot_storage.storage.
+
+This script is not productionized. Please use with caution.
+Pairs well with iterate_cycler.py
+"""
+
+import datetime
+import os
+import json
+import subprocess
+
+BQ_TABLE_ID = 'YOUR-BQ-TABLE'
+RESULTS_DIR_PATH= 'YOUR-RESULTS-DIR'
+FORMATTED_RESULTS_DIR_PATH= 'YOUR-FORMATTED-RESULTS-DIR'
+GS_BUCKET= 'YOUR-GS-BUCKET'  # Used in table so we can aggregate across buckets.
+
+for filename in os.listdir(RESULTS_DIR_PATH):
+    # Read and format results to be JSON KVs with newline delimiters per BQ.
+    file_path = os.path.join(RESULTS_DIR_PATH, filename)
+    formatted = []
+    with open(file_path, 'r', encoding='utf-8') as f:
+        data = json.load(f)
+        print(f"Formatting and uploading {file_path}")
+        for k,v in data["PrefixStats"]["PrefixMapSizeBytes"].items():
+            target = k.split('/')[0]
+            # Skip aggregated prefixes.
+            if target != k:
+                formatted.append({
+                  "target": target,
+                  "prefix": k,
+                  "bytes": v,
+                  # Include upload date for grouping periodic storage snapshots.
+                  "date": datetime.datetime.today().strftime('%Y-%m-%d'),
+                  "bucket": GS_BUCKET
+                })
+
+    # Ensure we have results and aren't trying to upload data for an empty dir.
+    if formatted:
+        formatted_file_path= os.path.join(FORMATTED_RESULTS_DIR_PATH, filename)
+        print(f"Writing out to {formatted_file_path}")
+        with open(formatted_file_path, 'w', encoding='utf-8') as f:
+            f.write("\n".join(json.dumps(obj) for obj in formatted))
+
+        # Upload row to BQ
+        cmd = [
+            'bq',
+            'load',
+            '--autodetect',
+            '--source_format=NEWLINE_DELIMITED_JSON',
+            BQ_TABLE_ID,
+            formatted_file_path
+        ]
+        cmd_str = ' '.join(cycler_cmd)
+        print(f'Running {cmd_str}')
+        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
+        print(result.stdout)