Fix the node_modules download hook.

Currently we use download_from_google_storage to download
the node_modules.tar.gz archive from GCS; however, the archive
includes a node_modules/.bin/ directory full of symlinks, and
download_from_google_storage refuses to extract an archive that
contains symlinks.

In theory we could (should?) add a flag to download_from_google_storage
to support this, now that symlinks should work everywhere, but in
the meantime this CL replaces download_from_google_storage with a
custom download script that does the same thing but allows such
archive members.

Change-Id: I1467f95197aae7dee2c1914916e4caa0fd0a381d
Reviewed-on: https://chromium-review.googlesource.com/c/experimental/website/+/3246590
Reviewed-by: Struan Shrimpton <sshrimp@google.com>
Commit-Queue: Dirk Pranke <dpranke@google.com>
diff --git a/DEPS b/DEPS
index 2a6e0dd..abdd172 100644
--- a/DEPS
+++ b/DEPS
@@ -98,15 +98,10 @@
     ],
   },
   {
-    'name': 'node_modules',
+    'name': 'fetch_node_modules',
     'pattern': '.',
-    'action': [ 'vpython3',
-                'third_party/depot_tools/download_from_google_storage.py',
-                '--no_resume',
-                '--extract',
-                '--no_auth',
-                '--bucket', 'dpranke-chromium-website-exp-storage',
-                '-s', 'node_modules.tar.gz.sha1',
+    'action': [ 'python3',
+                'scripts/fetch_node_modules.py'
     ],
   },
   {
diff --git a/scripts/fetch_node_modules.py b/scripts/fetch_node_modules.py
new file mode 100755
index 0000000..a4a89de
--- /dev/null
+++ b/scripts/fetch_node_modules.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Simple script to download the pinned Node modules from GCS.
+
+This script exists because the node_modules archive currently contains
+a node_modules/.bin directory with a bunch of symlinked files in it,
+and download_from_google_storage.py won't let you have archives with
+symlinks.
+
+In theory we should probably rebuild the node_modules distro without
+the .bin directory (using `npm install --no-bin-links`) but that would
+cause the build scripts to fail and we'd have to replace `npmw` with
+something else.
+"""
+
+import argparse
+import hashlib
+import os
+import subprocess
+import sys
+import tarfile
+
+SRC_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
+
+def main():
+  parser = argparse.ArgumentParser(description=__doc__)
+  parser.parse_args()
+
+  with open(os.path.join(SRC_ROOT, 'node_modules.tar.gz.sha1')) as fp:
+    expected_sha1 = fp.read().strip()
+
+  actual_sha1 = None
+  tgz = os.path.join(SRC_ROOT, 'node_modules.tar.gz')
+  if os.path.exists(tgz):
+    with open(tgz, 'rb') as fp:
+      s = hashlib.sha1()
+      s.update(fp.read())
+      actual_sha1 = s.hexdigest()
+
+  # TODO(dpranke): Consider whether we should validate that node_modules/
+  # and all of the expected files exist as well.
+  if actual_sha1 == expected_sha1:
+    return 0
+
+  retcode = subprocess.call([
+      sys.executable,
+      os.path.join(SRC_ROOT, 'third_party', 'depot_tools', 'gsutil.py'),
+      'cp',
+      'gs://dpranke-chromium-website-exp-storage/%s' % expected_sha1,
+      tgz
+  ])
+  if retcode:
+    return retcode
+
+  try:
+    # TODO(dpranke): download_from_google_storage puts in a fair amount
+    # of effort to not clobber an existing directory until it is sure it
+    # can extract the archive completely. Consider whether we should do
+    # the same.
+    with tarfile.open(tgz, 'r:gz') as tar:
+      tar.extractall(path=SRC_ROOT)
+    return 0
+  except Exception as e:
+    print(e)
+    return 1
+
+if __name__ == '__main__':
+  sys.exit(main())