[ninjalog] fully anonymize build stats before sending

We must not send PII (personally identifiable information).

This CL removes the following data from the uploaded metadata:
* build configs that can contain the build directory path
* the build directory path
* the hostname
* the cwd

A per-build UUID is used instead of a per-user UUID.


Bug: 900161
Change-Id: Id533762749806ad8616b7dc07f21b27dfe632c9a
Reviewed-on: https://chromium-review.googlesource.com/c/1369473
Reviewed-by: Shinya Kawanaka <shinyak@chromium.org>
Reviewed-by: Fumitoshi Ukai <ukai@chromium.org>
Commit-Queue: Takuto Ikuta <tikuta@chromium.org>
diff --git a/ninjalog_uploader.py b/ninjalog_uploader.py
index 255d6eb..843f106 100755
--- a/ninjalog_uploader.py
+++ b/ninjalog_uploader.py
@@ -29,6 +29,14 @@
 
 from third_party import httplib2
 
+# These build configs affect build performance a lot.
+# TODO(tikuta): Add 'blink_symbol_level', 'closure_compile' and
+#               'use_jumbo_build'.
+WHITELISTED_CONFIGS = (
+    'symbol_level', 'use_goma', 'is_debug', 'is_component_build', 'enable_nacl',
+    'host_os', 'host_cpu', 'target_os', 'target_cpu'
+)
+
 def IsGoogler(server):
     """Check whether this script run inside corp network."""
     try:
@@ -42,8 +50,16 @@
     """Parse gn_args as json and return config dictionary."""
     configs = json.loads(gn_args)
     build_configs = {}
+
     for config in configs:
-        build_configs[config["name"]] = config["current"]["value"]
+        key = config["name"]
+        if key not in WHITELISTED_CONFIGS:
+            continue
+        if 'current' in config:
+            build_configs[key] = config['current']['value']
+        else:
+            build_configs[key] = config['default']['value']
+
     return build_configs
 
 def GetBuildTargetFromCommandLine(cmdline):
@@ -74,17 +90,34 @@
 
     return targets
 
+def GetJflag(cmdline):
+    """Parse cmdline to get flag value for -j"""
+
+    for i in range(len(cmdline)):
+        if (cmdline[i] == '-j' and i + 1 < len(cmdline) and
+            cmdline[i+1].isdigit()):
+            return int(cmdline[i+1])
+
+        if (cmdline[i].startswith('-j') and
+            cmdline[i][len('-j'):].isdigit()):
+            return int(cmdline[i][len('-j'):])
+
 
 def GetMetadata(cmdline, ninjalog):
-    """Get metadata for uploaded ninjalog."""
+    """Get metadata for uploaded ninjalog.
+
+    Returned metadata has schema defined in
+    https://cs.chromium.org?q="type+Metadata+struct+%7B"+file:%5Einfra/go/src/infra/appengine/chromium_build_stats/ninjalog/
+
+    TODO(tikuta): Collect GOMA_* env var.
+    """
 
     build_dir = os.path.dirname(ninjalog)
 
     build_configs = {}
 
     try:
-        args = ['gn', 'args', build_dir, '--list', '--overrides-only',
-                '--short', '--json']
+        args = ['gn', 'args', build_dir, '--list', '--short', '--json']
         if sys.platform == 'win32':
             # gn in PATH is bat file in windows environment (except cygwin).
             args = ['cmd', '/c'] + args
@@ -101,13 +134,15 @@
 
     metadata = {
         'platform': platform.system(),
-        'cwd': build_dir,
-        'hostname': socket.gethostname(),
         'cpu_core': multiprocessing.cpu_count(),
-        'cmdline': cmdline,
         'build_configs': build_configs,
+        'targets': GetBuildTargetFromCommandLine(cmdline),
     }
 
+    jflag = GetJflag(cmdline)
+    if jflag is not None:
+        metadata['jobs'] = jflag
+
     return metadata
 
 def GetNinjalog(cmdline):
@@ -165,7 +200,7 @@
             g.write('# end of ninja log\n')
 
             metadata = GetMetadata(args.cmdline, ninjalog)
-            logging.info('send metadata: %s', metadata)
+            logging.info('send metadata: %s', json.dumps(metadata))
             g.write(json.dumps(metadata))
 
     h = httplib2.Http()
diff --git a/ninjalog_uploader_wrapper.py b/ninjalog_uploader_wrapper.py
index 9735201..406aa00 100755
--- a/ninjalog_uploader_wrapper.py
+++ b/ninjalog_uploader_wrapper.py
@@ -15,7 +15,7 @@
 THIS_DIR = os.path.dirname(__file__)
 UPLOADER = os.path.join(THIS_DIR, 'ninjalog_uploader.py')
 CONFIG = os.path.join(THIS_DIR, 'ninjalog.cfg')
-VERSION = 1
+VERSION = 2
 
 
 def LoadConfig():
@@ -40,17 +40,19 @@
 
 
 def ShowMessage(countdown):
+    whitelisted = '\n'.join(['  * %s' % config for config in
+                             ninjalog_uploader.WHITELISTED_CONFIGS])
     print """
 Your ninjalog will be uploaded to build stats server. The uploaded log will be
 used to analyze user side build performance.
 
 The following information will be uploaded with ninjalog.
 * OS (e.g. Win, Mac or Linux)
-* build directory (e.g. /home/foo/chromium/src/out/Release)
-* hostname
 * number of cpu cores of building machine
-* cmdline passed to ninja (e.g. ninja -C out/Default -j1024 chrome)
-* build config (e.g. use_goma=true, is_component_build=true, etc)
+* build targets (e.g. chrome, browser_tests)
+* parallelism passed by -j flag
+* following build configs
+%s
 
 Uploading ninjalog will be started after you run autoninja another %d time.
 
@@ -66,7 +68,7 @@
 You can find a more detailed explanation in
 %s
 
-""" % (countdown, __file__, __file__,
+""" % (whitelisted, countdown, __file__, __file__,
        os.path.abspath(os.path.join(THIS_DIR, "ninjalog.README.md")))
 
 
diff --git a/tests/ninjalog_uploader_test.py b/tests/ninjalog_uploader_test.py
index a80ae08..7716794 100755
--- a/tests/ninjalog_uploader_test.py
+++ b/tests/ninjalog_uploader_test.py
@@ -25,7 +25,14 @@
                 'default': {'value': 'false'},
                 'name': 'is_component_build'
             },
-        ])), {'is_component_build': 'true'})
+            {
+                'default': {'value': '"x64"'},
+                'name': 'host_cpu'
+            },
+        ])), {
+            'is_component_build': 'true',
+            'host_cpu': '"x64"',
+        })
 
         self.assertEqual(ninjalog_uploader.ParseGNArgs(json.dumps([
             {
@@ -85,6 +92,28 @@
         self.assertEqual(ninjalog_uploader.GetBuildTargetFromCommandLine(
             ['ninja', '-C', 'out/Release', 'chrome', 'all']), ['chrome', 'all'])
 
+    def test_get_j_flag(self):
+        self.assertEqual(ninjalog_uploader.GetJflag(
+            ['ninja']), None)
+
+        self.assertEqual(ninjalog_uploader.GetJflag(
+            ['ninja','-j', '1000']), 1000)
+
+        self.assertEqual(ninjalog_uploader.GetJflag(
+            ['ninja','-j', '1000a']), None)
+
+        self.assertEqual(ninjalog_uploader.GetJflag(
+            ['ninja','-j', 'a']), None)
+
+        self.assertEqual(ninjalog_uploader.GetJflag(
+            ['ninja','-j1000']), 1000)
+
+        self.assertEqual(ninjalog_uploader.GetJflag(
+            ['ninja','-ja']), None)
+
+        self.assertEqual(ninjalog_uploader.GetJflag(
+            ['ninja','-j']), None)
+
 
 if __name__ == '__main__':
     unittest.main()