[analyzer] Support for naive cross translation unit analysis

The aim of this patch is to be minimal, to enable incremental development of
the feature on top of the tree. This patch should be an NFC (no functional
change) when the feature is turned off. It is turned off by default and is
still considered experimental.

Technical details are available in the EuroLLVM Talk: 
http://llvm.org/devmtg/2017-03//2017/02/20/accepted-sessions.html#7

Note that the initial prototype was done by A. Sidorin et al.: http://lists.llvm.org/pipermail/cfe-dev/2015-October/045730.html

Contributions to the measurements and the new version of the code: Peter Szecsi, Zoltan Gera, Daniel Krupp, Kareem Khazem.

Differential Revision: https://reviews.llvm.org/D30691


Cr-Mirrored-From: https://chromium.googlesource.com/external/github.com/llvm-mirror/clang
Cr-Mirrored-Commit: 5b8b6afcd1b48d3de840874c45f7543c0d40aa64
diff --git a/README.md b/README.md
index 54bfc37..1b6fc48 100644
--- a/README.md
+++ b/README.md
@@ -41,6 +41,32 @@
 Use `--help` to know more about the commands.
 
 
+How to use the experimental Cross Translation Unit analysis
+-----------------------------------------------------------
+
+To run the CTU analysis, a compilation database file has to be created:
+
+    $ intercept-build <your build command>
+
+To run the Clang Static Analyzer against a compilation database
+with CTU analysis enabled, execute:
+    
+    $ analyze-build --ctu
+
+CTU analysis requires an additional (function-definition) collection phase.
+For debugging purposes, it is possible to execute the collection and the
+analysis phases separately. When doing so, the intermediate files used for
+the analysis are kept on the disk in `./ctu-dir`.
+    
+    # Collect and store the data required by the CTU analysis
+    $ analyze-build --ctu-collect-only
+    
+    # Analyze using the previously collected data
+    $ analyze-build --ctu-analyze-only
+
+Use `--help` to get more information about the commands.
+
+
 Limitations
 -----------
 
diff --git a/libscanbuild/__init__.py b/libscanbuild/__init__.py
index 800926e..e7b7487 100644
--- a/libscanbuild/__init__.py
+++ b/libscanbuild/__init__.py
@@ -19,6 +19,9 @@
 
 Execution = collections.namedtuple('Execution', ['pid', 'cwd', 'cmd'])
 
+CtuConfig = collections.namedtuple('CtuConfig', ['collect', 'analyze', 'dir',
+                                                 'func_map_cmd'])
+
 
 def duplicate_check(method):
     """ Predicate to detect duplicated entries.
diff --git a/libscanbuild/analyze.py b/libscanbuild/analyze.py
index 6c129bd..3c93b11 100644
--- a/libscanbuild/analyze.py
+++ b/libscanbuild/analyze.py
@@ -22,16 +22,19 @@
 import subprocess
 import contextlib
 import datetime
+import shutil
+import glob
+from collections import defaultdict
 
 from libscanbuild import command_entry_point, compiler_wrapper, \
-    wrapper_environment, run_build, run_command
+    wrapper_environment, run_build, run_command, CtuConfig
 from libscanbuild.arguments import parse_args_for_scan_build, \
     parse_args_for_analyze_build
 from libscanbuild.intercept import capture
 from libscanbuild.report import document
 from libscanbuild.compilation import split_command, classify_source, \
     compiler_language
-from libscanbuild.clang import get_version, get_arguments
+from libscanbuild.clang import get_version, get_arguments, get_triple_arch
 from libscanbuild.shell import decode
 
 __all__ = ['scan_build', 'analyze_build', 'analyze_compiler_wrapper']
@@ -39,6 +42,9 @@
 COMPILER_WRAPPER_CC = 'analyze-cc'
 COMPILER_WRAPPER_CXX = 'analyze-c++'
 
+CTU_FUNCTION_MAP_FILENAME = 'externalFnMap.txt'
+CTU_TEMP_FNMAP_FOLDER = 'tmpExternalFnMaps'
+
 
 @command_entry_point
 def scan_build():
@@ -56,7 +62,7 @@
             exit_code = capture(args)
             # Run the analyzer against the captured commands.
             if need_analyzer(args.build):
-                run_analyzer_parallel(args)
+                govern_analyzer_runs(args)
         else:
             # Run build command and analyzer with compiler wrappers.
             environment = setup_environment(args)
@@ -75,7 +81,7 @@
     # will re-assign the report directory as new output
     with report_directory(args.output, args.keep_empty) as args.output:
         # Run the analyzer against a compilation db.
-        run_analyzer_parallel(args)
+        govern_analyzer_runs(args)
         # Cover report generation and bug counting.
         number_of_bugs = document(args)
         # Set exit status as it was requested.
@@ -95,6 +101,108 @@
     return len(args) and not re.search('configure|autogen', args[0])
 
 
+def prefix_with(constant, pieces):
+    """ From a sequence create another sequence where every second element
+    is from the original sequence, each one preceded by the prefix constant.
+
+    eg.: prefix_with(0, [1,2,3]) creates [0, 1, 0, 2, 0, 3] """
+
+    return [elem for piece in pieces for elem in [constant, piece]]
+
+
+def get_ctu_config_from_args(args):
+    """ CTU configuration is created from the chosen phases and dir. """
+
+    return (
+        CtuConfig(collect=args.ctu_phases.collect,
+                  analyze=args.ctu_phases.analyze,
+                  dir=args.ctu_dir,
+                  func_map_cmd=args.func_map_cmd)
+        if hasattr(args, 'ctu_phases') and hasattr(args.ctu_phases, 'dir')
+        else CtuConfig(collect=False, analyze=False, dir='', func_map_cmd=''))
+
+
+def get_ctu_config_from_json(ctu_conf_json):
+    """ CTU configuration is recreated from its JSON serialized form. """
+
+    ctu_config = json.loads(ctu_conf_json)
+    # Recover namedtuple from json when coming from analyze-cc or analyze-c++
+    return CtuConfig(collect=ctu_config[0],
+                     analyze=ctu_config[1],
+                     dir=ctu_config[2],
+                     func_map_cmd=ctu_config[3])
+
+
+def create_global_ctu_function_map(func_map_lines):
+    """ Takes iterator of individual function maps and creates a global map
+    keeping only unique names. We leave conflicting names out of CTU.
+
+    :param func_map_lines: Contains the id of a function (mangled name) and
+    the originating source (the corresponding AST file) name.
+    :type func_map_lines: Iterator of str.
+    :returns: Mangled name - AST file pairs.
+    :rtype: List of (str, str) tuples.
+    """
+
+    mangled_to_asts = defaultdict(set)
+
+    for line in func_map_lines:
+        mangled_name, ast_file = line.strip().split(' ', 1)
+        mangled_to_asts[mangled_name].add(ast_file)
+
+    mangled_ast_pairs = []
+
+    for mangled_name, ast_files in mangled_to_asts.items():
+        if len(ast_files) == 1:
+            mangled_ast_pairs.append((mangled_name, next(iter(ast_files))))
+
+    return mangled_ast_pairs
+
+
+def merge_ctu_func_maps(ctudir):
+    """ Merge individual function maps into a global one.
+
+    As the collect phase runs in parallel on multiple threads, all compilation
+    units are separately mapped into a temporary file in CTU_TEMP_FNMAP_FOLDER.
+    These function maps contain the mangled names of functions and the source
+    (AST generated from the source) which had them.
+    These files should be merged at the end into a global map file:
+    CTU_FUNCTION_MAP_FILENAME."""
+
+    def generate_func_map_lines(fnmap_dir):
+        """ Iterate over all lines of input files in a deterministic order. """
+
+        files = glob.glob(os.path.join(fnmap_dir, '*'))
+        files.sort()
+        for filename in files:
+            with open(filename, 'r') as in_file:
+                for line in in_file:
+                    yield line
+
+    def write_global_map(arch, mangled_ast_pairs):
+        """ Write (mangled function name, ast file) pairs into final file. """
+
+        extern_fns_map_file = os.path.join(ctudir, arch,
+                                           CTU_FUNCTION_MAP_FILENAME)
+        with open(extern_fns_map_file, 'w') as out_file:
+            for mangled_name, ast_file in mangled_ast_pairs:
+                out_file.write('%s %s\n' % (mangled_name, ast_file))
+
+    triple_arches = glob.glob(os.path.join(ctudir, '*'))
+    for triple_path in triple_arches:
+        if os.path.isdir(triple_path):
+            triple_arch = os.path.basename(triple_path)
+            fnmap_dir = os.path.join(ctudir, triple_arch,
+                                     CTU_TEMP_FNMAP_FOLDER)
+
+            func_map_lines = generate_func_map_lines(fnmap_dir)
+            mangled_ast_pairs = create_global_ctu_function_map(func_map_lines)
+            write_global_map(triple_arch, mangled_ast_pairs)
+
+            # Remove all temporary files
+            shutil.rmtree(fnmap_dir, ignore_errors=True)
+
+
 def run_analyzer_parallel(args):
     """ Runs the analyzer against the given compilation database. """
 
@@ -109,7 +217,8 @@
         'output_format': args.output_format,
         'output_failures': args.output_failures,
         'direct_args': analyzer_params(args),
-        'force_debug': args.force_debug
+        'force_debug': args.force_debug,
+        'ctu': get_ctu_config_from_args(args)
     }
 
     logging.debug('run analyzer against compilation database')
@@ -127,6 +236,38 @@
         pool.join()
 
 
+def govern_analyzer_runs(args):
+    """ Governs multiple runs in CTU mode or runs once in normal mode. """
+
+    ctu_config = get_ctu_config_from_args(args)
+    # If we do a CTU collect (1st phase) we remove all previous collection
+    # data first.
+    if ctu_config.collect:
+        shutil.rmtree(ctu_config.dir, ignore_errors=True)
+
+    # If the user asked for a collect (1st) and analyze (2nd) phase, we do an
+    # all-in-one run where we deliberately remove collection data before and
+    # also after the run. If the user asks only for a single phase, data is
+    # left so multiple analyze runs can use the same data gathered by a single
+    # collection run.
+    if ctu_config.collect and ctu_config.analyze:
+        # CTU strings are coming from args.ctu_dir and args.func_map_cmd,
+        # so we can leave them empty
+        args.ctu_phases = CtuConfig(collect=True, analyze=False,
+                                    dir='', func_map_cmd='')
+        run_analyzer_parallel(args)
+        merge_ctu_func_maps(ctu_config.dir)
+        args.ctu_phases = CtuConfig(collect=False, analyze=True,
+                                    dir='', func_map_cmd='')
+        run_analyzer_parallel(args)
+        shutil.rmtree(ctu_config.dir, ignore_errors=True)
+    else:
+        # Single runs (collect or analyze) are launched from here.
+        run_analyzer_parallel(args)
+        if ctu_config.collect:
+            merge_ctu_func_maps(ctu_config.dir)
+
+
 def setup_environment(args):
     """ Set up environment for build command to interpose compiler wrapper. """
 
@@ -140,7 +281,8 @@
         'ANALYZE_BUILD_REPORT_FORMAT': args.output_format,
         'ANALYZE_BUILD_REPORT_FAILURES': 'yes' if args.output_failures else '',
         'ANALYZE_BUILD_PARAMETERS': ' '.join(analyzer_params(args)),
-        'ANALYZE_BUILD_FORCE_DEBUG': 'yes' if args.force_debug else ''
+        'ANALYZE_BUILD_FORCE_DEBUG': 'yes' if args.force_debug else '',
+        'ANALYZE_BUILD_CTU': json.dumps(get_ctu_config_from_args(args))
     })
     return environment
 
@@ -173,7 +315,8 @@
                                  '').split(' '),
         'force_debug': os.getenv('ANALYZE_BUILD_FORCE_DEBUG'),
         'directory': execution.cwd,
-        'command': [execution.cmd[0], '-c'] + compilation.flags
+        'command': [execution.cmd[0], '-c'] + compilation.flags,
+        'ctu': get_ctu_config_from_json(os.getenv('ANALYZE_BUILD_CTU'))
     }
     # call static analyzer against the compilation
     for source in compilation.files:
@@ -223,14 +366,6 @@
     """ A group of command line arguments can mapped to command
     line arguments of the analyzer. This method generates those. """
 
-    def prefix_with(constant, pieces):
-        """ From a sequence create another sequence where every second element
-        is from the original sequence and the odd elements are the prefix.
-
-        eg.: prefix_with(0, [1,2,3]) creates [0, 1, 0, 2, 0, 3] """
-
-        return [elem for piece in pieces for elem in [constant, piece]]
-
     result = []
 
     if args.store_model:
@@ -294,8 +429,9 @@
           'direct_args',  # arguments from command line
           'force_debug',  # kill non debug macros
           'output_dir',  # where generated report files shall go
-          'output_format',  # it's 'plist' or 'html' or both
-          'output_failures'])  # generate crash reports or not
+          'output_format',  # it's 'plist', 'html', both or plist-multi-file
+          'output_failures',  # generate crash reports or not
+          'ctu'])  # ctu control options
 def run(opts):
     """ Entry point to run (or not) static analyzer against a single entry
     of the compilation database.
@@ -383,7 +519,10 @@
 
     def target():
         """ Creates output file name for reports. """
-        if opts['output_format'] in {'plist', 'plist-html'}:
+        if opts['output_format'] in {
+                'plist',
+                'plist-html',
+                'plist-multi-file'}:
             (handle, name) = tempfile.mkstemp(prefix='report-',
                                               suffix='.plist',
                                               dir=opts['output_dir'])
@@ -407,8 +546,109 @@
         return result
 
 
+def func_map_list_src_to_ast(func_src_list):
+    """ Turns textual function map list with source files into a
+    function map list with ast files. """
+
+    func_ast_list = []
+    for fn_src_txt in func_src_list:
+        mangled_name, path = fn_src_txt.split(" ", 1)
+        # Drop the drive part so that Windows paths are handled as well
+        path = os.path.splitdrive(path)[1]
+        # Make relative path out of absolute
+        path = path[1:] if path[0] == os.sep else path
+        ast_path = os.path.join("ast", path + ".ast")
+        func_ast_list.append(mangled_name + " " + ast_path)
+    return func_ast_list
+
+
+@require(['clang', 'directory', 'flags', 'direct_args', 'file', 'ctu'])
+def ctu_collect_phase(opts):
+    """ Preprocess source by generating all data needed by CTU analysis. """
+
+    def generate_ast(triple_arch):
+        """ Generates ASTs for the current compilation command. """
+
+        args = opts['direct_args'] + opts['flags']
+        ast_joined_path = os.path.join(opts['ctu'].dir, triple_arch, 'ast',
+                                       os.path.realpath(opts['file'])[1:] +
+                                       '.ast')
+        ast_path = os.path.abspath(ast_joined_path)
+        ast_dir = os.path.dirname(ast_path)
+        if not os.path.isdir(ast_dir):
+            try:
+                os.makedirs(ast_dir)
+            except OSError:
+                # In case another process already created it.
+                pass
+        ast_command = [opts['clang'], '-emit-ast']
+        ast_command.extend(args)
+        ast_command.append('-w')
+        ast_command.append(opts['file'])
+        ast_command.append('-o')
+        ast_command.append(ast_path)
+        logging.debug("Generating AST using '%s'", ast_command)
+        run_command(ast_command, cwd=opts['directory'])
+
+    def map_functions(triple_arch):
+        """ Generate function map file for the current source. """
+
+        args = opts['direct_args'] + opts['flags']
+        funcmap_command = [opts['ctu'].func_map_cmd]
+        funcmap_command.append(opts['file'])
+        funcmap_command.append('--')
+        funcmap_command.extend(args)
+        logging.debug("Generating function map using '%s'", funcmap_command)
+        func_src_list = run_command(funcmap_command, cwd=opts['directory'])
+        func_ast_list = func_map_list_src_to_ast(func_src_list)
+        extern_fns_map_folder = os.path.join(opts['ctu'].dir, triple_arch,
+                                             CTU_TEMP_FNMAP_FOLDER)
+        if not os.path.isdir(extern_fns_map_folder):
+            try:
+                os.makedirs(extern_fns_map_folder)
+            except OSError:
+                # In case another process already created it.
+                pass
+        if func_ast_list:
+            with tempfile.NamedTemporaryFile(mode='w',
+                                             dir=extern_fns_map_folder,
+                                             delete=False) as out_file:
+                out_file.write("\n".join(func_ast_list) + "\n")
+
+    cwd = opts['directory']
+    cmd = [opts['clang'], '--analyze'] + opts['direct_args'] + opts['flags'] \
+        + [opts['file']]
+    triple_arch = get_triple_arch(cmd, cwd)
+    generate_ast(triple_arch)
+    map_functions(triple_arch)
+
+
+@require(['ctu'])
+def dispatch_ctu(opts, continuation=run_analyzer):
+    """ Execute only one phase of 2 phases of CTU if needed. """
+
+    ctu_config = opts['ctu']
+
+    if ctu_config.collect or ctu_config.analyze:
+        assert ctu_config.collect != ctu_config.analyze
+        if ctu_config.collect:
+            return ctu_collect_phase(opts)
+        if ctu_config.analyze:
+            cwd = opts['directory']
+            cmd = [opts['clang'], '--analyze'] + opts['direct_args'] \
+                + opts['flags'] + [opts['file']]
+            triarch = get_triple_arch(cmd, cwd)
+            ctu_options = ['ctu-dir=' + os.path.join(ctu_config.dir, triarch),
+                           'experimental-enable-naive-ctu-analysis=true']
+            analyzer_options = prefix_with('-analyzer-config', ctu_options)
+            direct_options = prefix_with('-Xanalyzer', analyzer_options)
+            opts['direct_args'].extend(direct_options)
+
+    return continuation(opts)
+
+
 @require(['flags', 'force_debug'])
-def filter_debug_flags(opts, continuation=run_analyzer):
+def filter_debug_flags(opts, continuation=dispatch_ctu):
     """ Filter out nondebug macros when requested. """
 
     if opts.pop('force_debug'):
@@ -475,6 +715,7 @@
         logging.debug('analysis, on default arch')
         return continuation(opts)
 
+
 # To have good results from static analyzer certain compiler options shall be
 # omitted. The compiler flag filtering only affects the static analyzer run.
 #
diff --git a/libscanbuild/arguments.py b/libscanbuild/arguments.py
index 2735123..00679a4 100644
--- a/libscanbuild/arguments.py
+++ b/libscanbuild/arguments.py
@@ -18,8 +18,8 @@
 import argparse
 import logging
 import tempfile
-from libscanbuild import reconfigure_logging
-from libscanbuild.clang import get_checkers
+from libscanbuild import reconfigure_logging, CtuConfig
+from libscanbuild.clang import get_checkers, is_ctu_capable
 
 __all__ = ['parse_args_for_intercept_build', 'parse_args_for_analyze_build',
            'parse_args_for_scan_build']
@@ -98,6 +98,11 @@
         # add cdb parameter invisibly to make report module working.
         args.cdb = 'compile_commands.json'
 
+    # Make ctu_dir an abspath as it is needed inside clang
+    if not from_build_command and hasattr(args, 'ctu_phases') \
+            and hasattr(args.ctu_phases, 'dir'):
+        args.ctu_dir = os.path.abspath(args.ctu_dir)
+
 
 def validate_args_for_analyze(parser, args, from_build_command):
     """ Command line parsing is done by the argparse module, but semantic
@@ -122,6 +127,18 @@
     elif not from_build_command and not os.path.exists(args.cdb):
         parser.error(message='compilation database is missing')
 
+    # If the user wants CTU mode
+    if not from_build_command and hasattr(args, 'ctu_phases') \
+            and hasattr(args.ctu_phases, 'dir'):
+        # If CTU analyze_only, the input directory should exist
+        if args.ctu_phases.analyze and not args.ctu_phases.collect \
+                and not os.path.exists(args.ctu_dir):
+            parser.error(message='missing CTU directory')
+        # Check CTU capability via checking clang-func-mapping
+        if not is_ctu_capable(args.func_map_cmd):
+            parser.error(message="""This version of clang does not support CTU
+            functionality or clang-func-mapping command not found.""")
+
 
 def create_intercept_parser():
     """ Creates a parser for command-line arguments to 'intercept'. """
@@ -218,7 +235,15 @@
         default='html',
         action='store_const',
         help="""Cause the results as a set of .html and .plist files.""")
-    # TODO: implement '-view '
+    format_group.add_argument(
+        '--plist-multi-file',
+        '-plist-multi-file',
+        dest='output_format',
+        const='plist-multi-file',
+        default='html',
+        action='store_const',
+        help="""Cause the results as a set of .plist files with extra
+        information on related files.""")
 
     advanced = parser.add_argument_group('advanced options')
     advanced.add_argument(
@@ -333,6 +358,51 @@
     if from_build_command:
         parser.add_argument(
             dest='build', nargs=argparse.REMAINDER, help="""Command to run.""")
+    else:
+        ctu = parser.add_argument_group('cross translation unit analysis')
+        ctu_mutex_group = ctu.add_mutually_exclusive_group()
+        ctu_mutex_group.add_argument(
+            '--ctu',
+            action='store_const',
+            const=CtuConfig(collect=True, analyze=True,
+                            dir='', func_map_cmd=''),
+            dest='ctu_phases',
+            help="""Perform cross translation unit (ctu) analysis (both collect
+            and analyze phases) using default <ctu-dir> for temporary output.
+            At the end of the analysis, the temporary directory is removed.""")
+        ctu.add_argument(
+            '--ctu-dir',
+            metavar='<ctu-dir>',
+            dest='ctu_dir',
+            default='ctu-dir',
+            help="""Defines the temporary directory used between ctu
+            phases.""")
+        ctu_mutex_group.add_argument(
+            '--ctu-collect-only',
+            action='store_const',
+            const=CtuConfig(collect=True, analyze=False,
+                            dir='', func_map_cmd=''),
+            dest='ctu_phases',
+            help="""Perform only the collect phase of ctu.
+            Keep <ctu-dir> for further use.""")
+        ctu_mutex_group.add_argument(
+            '--ctu-analyze-only',
+            action='store_const',
+            const=CtuConfig(collect=False, analyze=True,
+                            dir='', func_map_cmd=''),
+            dest='ctu_phases',
+            help="""Perform only the analyze phase of ctu. <ctu-dir> should be
+            present and will not be removed after analysis.""")
+        ctu.add_argument(
+            '--use-func-map-cmd',
+            metavar='<path>',
+            dest='func_map_cmd',
+            default='clang-func-mapping',
+            help="""'%(prog)s' uses the 'clang-func-mapping' executable
+            relative to itself for generating function maps for static
+            analysis. One can override this behavior with this option by using
+            the 'clang-func-mapping' packaged with Xcode (on OS X) or from the
+            PATH.""")
     return parser
 
 
diff --git a/libscanbuild/clang.py b/libscanbuild/clang.py
index 192e708..ab42206 100644
--- a/libscanbuild/clang.py
+++ b/libscanbuild/clang.py
@@ -8,11 +8,13 @@
 Since Clang command line interface is so rich, but this project is using only
 a subset of that, it makes sense to create a function specific wrapper. """
 
+import subprocess
 import re
 from libscanbuild import run_command
 from libscanbuild.shell import decode
 
-__all__ = ['get_version', 'get_arguments', 'get_checkers']
+__all__ = ['get_version', 'get_arguments', 'get_checkers', 'is_ctu_capable',
+           'get_triple_arch']
 
 # regex for activated checker
 ACTIVE_CHECKER_PATTERN = re.compile(r'^-analyzer-checker=(.*)$')
@@ -152,3 +154,26 @@
         raise Exception('Could not query Clang for available checkers.')
 
     return checkers
+
+
+def is_ctu_capable(func_map_cmd):
+    """ Detects CTU capability by checking that the given function mapping
+    executable can be run (it is invoked with '-version'). """
+
+    try:
+        run_command([func_map_cmd, '-version'])
+    except (OSError, subprocess.CalledProcessError):
+        return False
+    return True
+
+
+def get_triple_arch(command, cwd):
+    """Returns the value that follows '-triple' in the arguments derived
+    from the given compilation command, or '' when there is none. """
+
+    cmd = get_arguments(command, cwd)
+    try:
+        separator = cmd.index("-triple")
+        return cmd[separator + 1]
+    except (IndexError, ValueError):
+        return ""
diff --git a/libscanbuild/report.py b/libscanbuild/report.py
index 54b9695..b3753c1 100644
--- a/libscanbuild/report.py
+++ b/libscanbuild/report.py
@@ -13,7 +13,6 @@
 import os.path
 import sys
 import shutil
-import itertools
 import plistlib
 import glob
 import json
@@ -255,24 +254,29 @@
 
 
 def read_bugs(output_dir, html):
+    # type: (str, bool) -> Generator[Dict[str, Any], None, None]
     """ Generate a unique sequence of bugs from given output directory.
 
     Duplicates can be in a project if the same module was compiled multiple
     times with different compiler options. These would be better to show in
     the final report (cover) only once. """
 
-    parser = parse_bug_html if html else parse_bug_plist
-    pattern = '*.html' if html else '*.plist'
+    def empty(file_name):
+        return os.stat(file_name).st_size == 0
 
     duplicate = duplicate_check(
         lambda bug: '{bug_line}.{bug_path_length}:{bug_file}'.format(**bug))
 
-    bugs = itertools.chain.from_iterable(
-        # parser creates a bug generator not the bug itself
-        parser(filename)
-        for filename in glob.iglob(os.path.join(output_dir, pattern)))
+    # get the right parser for the job.
+    parser = parse_bug_html if html else parse_bug_plist
+    # get the input files, which are not empty.
+    pattern = os.path.join(output_dir, '*.html' if html else '*.plist')
+    bug_files = (file for file in glob.iglob(pattern) if not empty(file))
 
-    return (bug for bug in bugs if not duplicate(bug))
+    for bug_file in bug_files:
+        for bug in parser(bug_file):
+            if not duplicate(bug):
+                yield bug
 
 
 def parse_bug_plist(filename):
diff --git a/tests/unit/test_analyze.py b/tests/unit/test_analyze.py
index a250ff2..9964a29 100644
--- a/tests/unit/test_analyze.py
+++ b/tests/unit/test_analyze.py
@@ -4,12 +4,12 @@
 # This file is distributed under the University of Illinois Open Source
 # License. See LICENSE.TXT for details.
 
-import libear
-import libscanbuild.analyze as sut
 import unittest
 import re
 import os
 import os.path
+import libear
+import libscanbuild.analyze as sut
 
 
 class ReportDirectoryTest(unittest.TestCase):
@@ -333,3 +333,83 @@
 
     def test_method_exception_not_caught(self):
         self.assertRaises(Exception, method_exception_from_inside, dict())
+
+
+class PrefixWithTest(unittest.TestCase):
+
+    def test_gives_empty_on_empty(self):
+        res = sut.prefix_with(0, [])
+        self.assertFalse(res)
+
+    def test_interleaves_prefix(self):
+        res = sut.prefix_with(0, [1, 2, 3])
+        self.assertListEqual([0, 1, 0, 2, 0, 3], res)
+
+
+class MergeCtuMapTest(unittest.TestCase):
+
+    def test_no_map_gives_empty(self):
+        pairs = sut.create_global_ctu_function_map([])
+        self.assertFalse(pairs)
+
+    def test_multiple_maps_merged(self):
+        concat_map = ['c:@F@fun1#I# ast/fun1.c.ast',
+                      'c:@F@fun2#I# ast/fun2.c.ast',
+                      'c:@F@fun3#I# ast/fun3.c.ast']
+        pairs = sut.create_global_ctu_function_map(concat_map)
+        self.assertTrue(('c:@F@fun1#I#', 'ast/fun1.c.ast') in pairs)
+        self.assertTrue(('c:@F@fun2#I#', 'ast/fun2.c.ast') in pairs)
+        self.assertTrue(('c:@F@fun3#I#', 'ast/fun3.c.ast') in pairs)
+        self.assertEqual(3, len(pairs))
+
+    def test_not_unique_func_left_out(self):
+        concat_map = ['c:@F@fun1#I# ast/fun1.c.ast',
+                      'c:@F@fun2#I# ast/fun2.c.ast',
+                      'c:@F@fun1#I# ast/fun7.c.ast']
+        pairs = sut.create_global_ctu_function_map(concat_map)
+        self.assertFalse(('c:@F@fun1#I#', 'ast/fun1.c.ast') in pairs)
+        self.assertFalse(('c:@F@fun1#I#', 'ast/fun7.c.ast') in pairs)
+        self.assertTrue(('c:@F@fun2#I#', 'ast/fun2.c.ast') in pairs)
+        self.assertEqual(1, len(pairs))
+
+    def test_duplicates_are_kept(self):
+        concat_map = ['c:@F@fun1#I# ast/fun1.c.ast',
+                      'c:@F@fun2#I# ast/fun2.c.ast',
+                      'c:@F@fun1#I# ast/fun1.c.ast']
+        pairs = sut.create_global_ctu_function_map(concat_map)
+        self.assertTrue(('c:@F@fun1#I#', 'ast/fun1.c.ast') in pairs)
+        self.assertTrue(('c:@F@fun2#I#', 'ast/fun2.c.ast') in pairs)
+        self.assertEqual(2, len(pairs))
+
+    def test_space_handled_in_source(self):
+        concat_map = ['c:@F@fun1#I# ast/f un.c.ast']
+        pairs = sut.create_global_ctu_function_map(concat_map)
+        self.assertTrue(('c:@F@fun1#I#', 'ast/f un.c.ast') in pairs)
+        self.assertEqual(1, len(pairs))
+
+
+class FuncMapSrcToAstTest(unittest.TestCase):
+
+    def test_empty_gives_empty(self):
+        fun_ast_lst = sut.func_map_list_src_to_ast([])
+        self.assertFalse(fun_ast_lst)
+
+    def test_sources_to_asts(self):
+        fun_src_lst = ['c:@F@f1#I# ' + os.path.join(os.sep + 'path', 'f1.c'),
+                       'c:@F@f2#I# ' + os.path.join(os.sep + 'path', 'f2.c')]
+        fun_ast_lst = sut.func_map_list_src_to_ast(fun_src_lst)
+        self.assertTrue('c:@F@f1#I# ' +
+                        os.path.join('ast', 'path', 'f1.c.ast')
+                        in fun_ast_lst)
+        self.assertTrue('c:@F@f2#I# ' +
+                        os.path.join('ast', 'path', 'f2.c.ast')
+                        in fun_ast_lst)
+        self.assertEqual(2, len(fun_ast_lst))
+
+    def test_spaces_handled(self):
+        fun_src_lst = ['c:@F@f1#I# ' + os.path.join(os.sep + 'path', 'f 1.c')]
+        fun_ast_lst = sut.func_map_list_src_to_ast(fun_src_lst)
+        self.assertTrue('c:@F@f1#I# ' +
+                        os.path.join('ast', 'path', 'f 1.c.ast')
+                        in fun_ast_lst)
+        self.assertEqual(1, len(fun_ast_lst))
diff --git a/tests/unit/test_clang.py b/tests/unit/test_clang.py
index eef8c26..07ac4d9 100644
--- a/tests/unit/test_clang.py
+++ b/tests/unit/test_clang.py
@@ -92,3 +92,15 @@
         self.assertEqual('Checker One description', result.get('checker.one'))
         self.assertTrue('checker.two' in result)
         self.assertEqual('Checker Two description', result.get('checker.two'))
+
+
+class ClangIsCtuCapableTest(unittest.TestCase):
+    def test_ctu_not_found(self):
+        is_ctu = sut.is_ctu_capable('not-found-clang-func-mapping')
+        self.assertFalse(is_ctu)
+
+
+class ClangGetTripleArchTest(unittest.TestCase):
+    def test_arch_is_not_empty(self):
+        arch = sut.get_triple_arch(['clang', '-E', '-'], '.')
+        self.assertTrue(len(arch) > 0)