summaryrefslogtreecommitdiff
path: root/buildscripts/iwyu/run_iwyu_analysis.py
blob: 85606056cc178b41a97b1f239932cdf45558d639 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
#!/usr/bin/env python3
"""
TOOL FUNCTIONAL DESCRIPTION.

Currently the tool works by running IWYU on a subset of compile_commands.json
(the ones we care about like checked in mongo source) and testing each change
in a copy of the original source/header tree so that other compiles are not
affected until it passes a normal compile itself. Due to header dependencies
we must recompile the source files to catch issue IWYU may have introduced
with some dependent header change. Header dependencies do not form a DAG so
we can not process sources in a deterministic fashion. The tool will loop
through all the compilations until all dependents in a compilation are
determined unchanged from the last time the compilation was performed.

The general workflow used here is to run the tool until there are no changes
(several hours on rhel-xxlarge) and fix the errors either in the tool config
or as a manual human change in the code.

TOOL TECHNICAL DESCRIPTION:

Regarding the code layout, the main function sets up a thread pool executor
and processes each source from the compile_commands. From there it runs a
thread function and within that 5 parts (each its own function) for
each source file:

1. Skip if deps are unchanged
2. Get the headers deps via -MMD
3. Run IWYU
4. Apply Fixes
5. test compile, record new header deps if passed

The tool uses mtime and MD5 hashing to know if any header dep has changed.

"""

import argparse
import json
import subprocess
import tempfile
import shlex
import os
import re
import concurrent.futures
import hashlib
import atexit
import traceback
import threading
import shutil
import signal
import sys
import yaml
import enum
from dataclasses import dataclass, asdict
from typing import Dict, List, Any, Optional, Callable, Union, Tuple

from tqdm import tqdm
from colorama import init as colorama_init
from colorama import Fore

# Initialise colorama before any coloured output is printed.
colorama_init()

# Command line interface. Arguments are parsed at import time and the result is read by the
# rest of the module through ``command_line_args``.
parser = argparse.ArgumentParser(description='Run include what you use and test output')

parser.add_argument('--compile-commands', metavar='FILE', type=str, default='compile_commands.json',
                    help='Path to the compile commands file to use.')
parser.add_argument(
    '--check', action='store_true', help=
    'Enables check mode, which does not apply fixes and only runs to see if any files produce IWYU changes. Exit 0 if no new changes detected.'
)
# The help text used to be a copy of the --check help; describe what the option really does.
parser.add_argument(
    '--config-file', metavar='FILE', type=str, default="", help=
    'Path to the YAML config file to use. Defaults to iwyu_config.yml located next to this script.'
)
parser.add_argument(
    '--iwyu-data', metavar='FILE', type=str, default='iwyu.dat',
    help='Location of data used by IWYU, contains hash and status info about all files.')
parser.add_argument(
    '--keep-going', action='store_true', help=
    'Do not stop on errors, instead resubmit the job to try again later (after things may have been fixed elsewhere)'
)
parser.add_argument(
    '--cycle-debugging', action='store_true', help=
    'Once a cycle has been detected, each directory tree for each step in the cycle will be saved to a .cycle directory.'
)
parser.add_argument('--verbose', action='store_true',
                    help='Prints more info about what is taking place.')
parser.add_argument('--mongo-toolchain-bin-dir', type=str,
                    help='Which toolchain bin directory to use for this analysis.',
                    default='/opt/mongodbtoolchain/v4/bin')
parser.add_argument(
    '--start-ratio', type=float, help=
    'decimal value between 0 and 1 which indicates what starting ratio index of the total compile commands to run over, can not be greater than the --end-ratio.',
    default=0.0)
parser.add_argument(
    '--end-ratio', type=float, help=
    'decimal value between 0 and 1 which indicates what ending ratio index of the total compile commands to run over, can not be less than the --start-ratio.',
    default=1.0)
command_line_args = parser.parse_args()

# the current state of all files, contain the cmd_entry, hashes, successes
IWYU_ANALYSIS_STATE: Dict[str, Any] = {}

# the current state cycles being tracked
IWYU_CYCLE_STATE: Dict[str, Any] = {}

# per-file locks and the mtime -> hash cache used by calc_hash_of_file
hash_lookup_locks: Dict[str, threading.Lock] = {}
mtime_hash_lookup: Dict[str, Dict[str, Any]] = {}

# an explicit --config-file wins, otherwise use the config shipped next to this script
if command_line_args.config_file:
    config_file = command_line_args.config_file
else:
    config_file = os.path.join(os.path.dirname(__file__), "iwyu_config.yml")

with open(config_file, "r") as stream:
    # safe_load returns None for an empty document; fall back to an empty mapping so an
    # empty config file simply means "all defaults" instead of raising on .items()
    config = yaml.safe_load(stream) or {}
    # a key present without a value loads as None, treat it as an empty list
    for key, value in config.items():
        if value is None:
            config[key] = []

IWYU_OPTIONS = config.get('iwyu_options', [])
IWYU_FIX_OPTIONS = config.get('fix_options', [])
NO_INCLUDES = config.get('no_includes', [])
KEEP_INCLUDES = config.get('keep_includes', [])
# tuple so it can be passed directly to str.startswith
SKIP_FILES = tuple(config.get('skip_files', []))
# sources for which a cycle was detected during this run
CYCLE_FILES: List[str] = []


@dataclass
class CompileCommand:
    """An entry from compile_commands.json."""

    # path of the source file; used as the key into IWYU_ANALYSIS_STATE
    file: str
    # the compile command line; the analysis steps run it through the shell
    command: str
    # NOTE(review): presumably the working directory of the command; not read in this part of the file
    directory: str
    # output path(s) of the compile; split with shlex where directories need creating
    output: str


class ResultType(enum.Enum):
    """
    Result of a processing step for a compile command entry.

    ERROR: unexpected or unrecognized error cases
    FAILED: the IWYU task for a given compile command entry failed
    NO_CHANGE: the input header tree and source file have not changed since last time
    NOT_RUNNING: sources which we intentionally skip running IWYU altogether
    RESUBMIT: the IWYU task failed, but it may work later after other header changes
    SUCCESS: the IWYU task for a source file has succeeded
    """

    ERROR = enum.auto()
    FAILED = enum.auto()
    NO_CHANGE = enum.auto()
    NOT_RUNNING = enum.auto()
    RESUBMIT = enum.auto()
    SUCCESS = enum.auto()


# toolchain bin directory that holds clang++, include-what-you-use and fix_includes.py
TOOLCHAIN_DIR = command_line_args.mongo_toolchain_bin_dir
# consulted by printer() to quiet output while shutting down
SHUTDOWN_FLAG = False
# lazily filled cache, see get_clang_includes()
CLANG_INCLUDES = None
# every IWYU option has to be handed over behind its own -Xiwyu flag
IWYU_OPTIONS = [arg for option in IWYU_OPTIONS for arg in ('-Xiwyu', option)]
# the include regexes only exist when the config lists something to match
if NO_INCLUDES:
    NO_INCLUDE_REGEX = re.compile(r'^\s*#include\s+\"(' + '|'.join(NO_INCLUDES) + ')\"')
if KEEP_INCLUDES:
    KEEP_INCLUDE_REGEX = re.compile(r'^\s*#include\s+(' + '|'.join(KEEP_INCLUDES) + ')')
# matches the per-file header line of the IWYU output
CHANGED_FILES_REGEX = re.compile(r"^The\sfull\sinclude-list\sfor\s(.+):$", re.MULTILINE)


def printer(message: str) -> None:
    """
    Write a message through tqdm unless output is being suppressed.

    Once shutdown has started, messages are dropped (verbose mode excepted) so the
    log does not explode and bury the original error.
    """

    suppressed = SHUTDOWN_FLAG and not command_line_args.verbose
    if suppressed:
        return
    tqdm.write(str(message))


def debug_printer(message: str) -> None:
    """Write a step-by-step progress message; only emitted in verbose mode."""

    if not command_line_args.verbose:
        return
    tqdm.write(str(message))


def failed_return() -> ResultType:
    """Pick the result for a failed file: RESUBMIT with --keep-going, FAILED otherwise."""

    return ResultType.RESUBMIT if command_line_args.keep_going else ResultType.FAILED


def in_project_root(file: str) -> bool:
    """
    Return true if the file is in the project root.

    This is assuming the project root is the same location
    as the compile_commands.json file (the format of compile_commands.json
    expects this as well).

    The comparison is done on whole path components, so a sibling directory that
    merely shares a name prefix with the project root (``/src/mongo`` vs
    ``/src/mongo_other``) is not considered part of the project.
    """

    project_root = os.path.abspath(os.path.dirname(command_line_args.compile_commands))
    candidate = os.path.abspath(file)
    try:
        return os.path.commonpath([project_root, candidate]) == project_root
    except ValueError:
        # commonpath refuses paths without a common anchor (e.g. different drives on
        # Windows); such a file can not be inside the project root
        return False


def copy_error_state(cmd_entry: CompileCommand, test_dir: str,
                     dir_ext: str = '.iwyu_test_dir') -> Optional[str]:
    """
    Snapshot the temp working directory next to the compile output.

    The copy lets the failing command be replayed later for debugging. The snapshot
    directory is the output path with its extension swapped for dir_ext; any earlier
    snapshot there is removed first. Returns the path of the copied tree, or None in
    check mode.
    """

    # check mode never copies files into a test_dir, so there is nothing to save
    if command_line_args.check:
        return None

    snapshot_root = os.path.splitext(cmd_entry.output)[0] + dir_ext

    # drop results from a previous failure before recreating the directory
    if os.path.exists(snapshot_root):
        shutil.rmtree(snapshot_root)
    os.makedirs(snapshot_root, exist_ok=True)

    error_state_dir = os.path.join(snapshot_root, os.path.basename(test_dir))
    shutil.copytree(test_dir, error_state_dir)
    return error_state_dir


def calc_hash_of_file(file: str) -> str:
    """
    Calculate the MD5 hash of a file, using the mtime as a cache key.

    If the mtime is unchanged since the last call, no file content is read and the
    previously computed hash is returned.

    Raises OSError (e.g. FileNotFoundError) if the file can not be stat'ed or read.
    """

    # we need to lock on specific file io because GIL does not cover system io, so two threads
    # could be doing io on the same file at the same time. setdefault is one dict operation,
    # so concurrent first-time callers end up sharing a single lock object instead of each
    # installing their own.
    file_lock = hash_lookup_locks.setdefault(file, threading.Lock())
    with file_lock:
        # sample the mtime before reading: if the file is rewritten while we hash it, the
        # recorded mtime is the older one and the next call rehashes instead of trusting
        # a stale entry
        mtime = os.path.getmtime(file)
        cached = mtime_hash_lookup.get(file)
        if cached is not None and cached['mtime'] == mtime:
            return cached['hash']

        # close the handle deterministically instead of leaving it to the garbage collector
        with open(file, 'rb') as stream:
            hash_val = hashlib.md5(stream.read()).hexdigest()
        mtime_hash_lookup[file] = {'mtime': mtime, 'hash': hash_val}
        return hash_val


def find_no_include(line: str, lines: List[str], output_lines: List[str]) -> bool:
    """
    Check a line against NO_INCLUDE_REGEX and queue a no_include pragma for it.

    Returns True when the line is an include of a configured no_include header; the
    caller then leaves the line out of the rewritten file. In its place an
    ``IWYU pragma: no_include`` line is appended to output_lines, unless that exact
    pragma line already exists in lines.
    """

    matched_headers = NO_INCLUDE_REGEX.findall(line)
    if not matched_headers:
        return False

    pragma_line = f'// IWYU pragma: no_include "{matched_headers[0]}"\n'
    if pragma_line not in lines:
        output_lines.append(pragma_line)
    return True


def add_pragmas(source_files: List[str]):
    """
    Insert the IWYU pragmas we can derive automatically into each file.

    Every existing file is read, its lines are matched against the configured
    no_include / keep_include patterns, and the file is written back with the
    corresponding pragma comments applied.
    """

    for path in source_files:

        # the header name is only a guess before IWYU has run (.cpp swapped for .h), so the
        # file may simply not exist; after IWYU runs the real header is known
        if not os.path.exists(path):
            continue

        with open(path, 'r') as fin:
            original_lines = fin.readlines()

        rewritten: List[str] = []
        for text in original_lines:

            # a no_include header line is dropped, find_no_include queues the pragma instead
            if NO_INCLUDES and find_no_include(text, original_lines, rewritten):
                continue

            wants_keep = (KEEP_INCLUDES and re.search(KEEP_INCLUDE_REGEX, text)
                          and '// IWYU pragma: keep' not in text)
            if wants_keep:
                rewritten.append(text.strip() + " // IWYU pragma: keep\n")
            else:
                rewritten.append(text)

        with open(path, 'w') as fout:
            fout.writelines(rewritten)


def recalc_hashes(deps: List[str], change_dir: Optional[str] = None) -> Dict[str, Any]:
    """
    Hash every in-project dependency and fold the results into one cumulative hash.

    The result has the shape ``{'deps': {path: hash, ...}, 'full_hash': hash}``. Deps
    outside the project root are ignored.

    When change_dir is given, file content is read from below that directory (the
    isolated test dir), while the dep is still recorded under its original name so
    the result stays comparable with hashes taken from the project root.
    """

    dep_hashes: Dict[str, str] = {}
    cumulative = hashlib.md5()
    for dep in sorted(deps):
        if not in_project_root(dep):
            continue
        # read from the test directory if requested, but keep the original key
        path_to_hash = os.path.join(change_dir, dep) if change_dir else dep
        dep_hash = calc_hash_of_file(path_to_hash)
        dep_hashes[dep] = dep_hash
        cumulative.update(dep_hash.encode('utf-8'))

    hashes: Dict[str, Any] = {'deps': dep_hashes}
    hashes['full_hash'] = cumulative.hexdigest()
    return hashes


def setup_test_dir(cmd_entry: CompileCommand, test_dir: str) -> List[str]:
    """
    Populate an isolated test directory with the header tree the source needs.

    Returns the test-dir paths of the source and its associated header (those of the
    two that exist in the main tree).

    Working in an isolated copy keeps parallel analysis and fixes from clashing. The
    associated header is not known for sure before IWYU runs, but in the mongo
    codebase it is almost always the source name with .cpp swapped for .h. That guess
    is only used to add pragmas which stop IWYU removing headers it does not
    understand; the pragmas are harmless if the guess turns out wrong.
    """

    candidates = (cmd_entry.file, os.path.splitext(cmd_entry.file)[0] + '.h')
    test_source_files = [
        os.path.join(test_dir, candidate) for candidate in candidates
        if os.path.exists(candidate)
    ]

    recorded_deps = list(IWYU_ANALYSIS_STATE[cmd_entry.file]['hashes']['deps'])

    # copying the header tree costs time, but operating in the real source tree was much
    # slower overall because of constant failures
    for path in [*recorded_deps, 'etc/iwyu_mapping.imp']:
        if not in_project_root(path):
            continue
        os.makedirs(os.path.join(test_dir, os.path.dirname(path)), exist_ok=True)
        shutil.copyfile(path, os.path.join(test_dir, path))

    # the compile needs its output directories to exist
    for output in shlex.split(cmd_entry.output):
        output_dir = os.path.join(test_dir, os.path.dirname(output))
        os.makedirs(output_dir, exist_ok=True)

    return test_source_files


def get_clang_includes() -> List[str]:
    """
    IWYU needs some extra help to know what default includes clang is going to bring in when it normally compiles.

    The toolchain clang++ is queried for its include search directories and each one is
    returned as a ``-I<dir>`` argument. The result is cached in CLANG_INCLUDES so the
    subprocess only runs once.

    Blank lines in the query output are skipped: a bare ``-I`` would otherwise be emitted
    (for instance when the query prints nothing at all) and swallow whatever argument
    follows it on the IWYU command line.
    """
    global CLANG_INCLUDES  # pylint: disable=global-statement
    if CLANG_INCLUDES is None:
        query_output = subprocess.getoutput(
            f"{TOOLCHAIN_DIR}/clang++ -Wp,-v -x c++ - -fsyntax-only < /dev/null 2>&1 | sed -e '/^#include <...>/,/^End of search/{{ //!b }};d'"
        )
        CLANG_INCLUDES = [
            '-I' + include.strip() for include in query_output.split('\n') if include.strip()
        ]
    return CLANG_INCLUDES


def write_cycle_diff(source_file: str, cycle_dir: str, latest_hashes: Dict[str, Any]) -> None:
    """
    Dump a before/after hash listing for one cycle iteration.

    Writes hashes_diff.txt into cycle_dir with one line per dep seen in either the
    recorded state or the latest hashes, showing the recorded hash and the latest hash.
    """

    report_path = os.path.join(cycle_dir, 'hashes_diff.txt')
    with open(report_path, 'w') as out:
        recorded_deps = IWYU_ANALYSIS_STATE[source_file]['hashes']['deps']
        latest_deps = latest_hashes['deps']
        # padded so the columns line up with the width of a real md5 hex digest
        not_found_str = "not found" + (" " * 23)
        for dep in sorted(set(recorded_deps) | set(latest_deps)):
            before = recorded_deps.get(dep, not_found_str)
            after = latest_deps.get(dep, not_found_str)
            out.write(f"Original: {before}, Latest: {after} - {dep}\n")


def check_for_cycles(cmd_entry: CompileCommand, latest_hashes: Dict[str, Any],
                     test_dir: str) -> Optional[ResultType]:
    """
    IWYU can induce cycles so we should check our previous results to see if a cycle has occurred.

    These cycles can happen if a header change induces some other header change which then in turn
    induces the original header change. These cycles are generally harmless and are easily broken
    with a keep pragma, but finding which files induce the cycle is the challenge.

    With cycle debug mode enabled, the entire header tree is saved for each iteration in the cycle
    so all files can be fully examined.

    Returns None when processing should carry on: either the cumulative hash has not been seen
    before for this source, or (cycle debug mode) the cycle is still being captured. Without
    cycle debugging a repeated hash records the source in CYCLE_FILES and returns SUCCESS. With
    cycle debugging, once an already captured hash shows up again the result of failed_return()
    is returned.
    """

    cycle_state = IWYU_CYCLE_STATE.setdefault(cmd_entry.file, {'cycles': []})
    full_hash = latest_hashes['full_hash']

    # first time we see this header tree state: remember it and carry on
    if full_hash not in cycle_state['cycles']:
        cycle_state['cycles'].append(full_hash)
        return None

    if not command_line_args.cycle_debugging:
        printer(f"{Fore.RED}[5] - Cycle Found!: {cmd_entry.file}{Fore.RESET}")
        CYCLE_FILES.append(cmd_entry.file)
        return ResultType.SUCCESS

    debug_cycles = cycle_state.setdefault('debug_cycles', {})

    # This has to be evaluated BEFORE the hash is recorded below. Testing membership after
    # the insert is always true, which made the "Cycle Found" branch unreachable and ended
    # the capture on the very first repeated hash.
    already_captured = full_hash in debug_cycles
    debug_cycles[full_hash] = latest_hashes

    # save the tree for this step of the cycle so it can be examined afterwards
    cycle_dir = copy_error_state(cmd_entry, test_dir,
                                 dir_ext=f".{full_hash}.cycle{len(debug_cycles)}")
    write_cycle_diff(cmd_entry.file, cycle_dir, latest_hashes)

    if already_captured:
        # we are back at a step that was already saved, so the whole cycle is captured
        printer(f"{Fore.RED}[5] - Cycle Done! : {cmd_entry.file}{Fore.RESET}")
        return failed_return()

    printer(f"{Fore.YELLOW}[5] - Cycle Found!: {cmd_entry.file}{Fore.RESET}")
    return None


def write_iwyu_data() -> None:
    """
    Store the data we have acquired during this run so we can resume at the same spot on subsequent runs.

    Nothing is written while IWYU_ANALYSIS_STATE is empty. The data is first written to a
    temporary file in the same directory as the target and then renamed over it, so an
    interrupted run never leaves a partially written data file behind. I/O failures are
    reported on stderr instead of being raised, and the existing data file is left untouched.
    """

    # There might be faster ways to store this like serialization or
    # what not, but having human readable json is good for debugging.
    # on a full build this takes around 10 seconds to write out.
    if not IWYU_ANALYSIS_STATE:
        return

    target = command_line_args.iwyu_data
    target_dir = os.path.dirname(os.path.abspath(target))
    temp_path = None
    try:
        # The temp file must live next to the target: a rename is only atomic within one
        # filesystem, and a temp file in the default temp dir may sit on another one.
        temp_fd, temp_path = tempfile.mkstemp(dir=target_dir,
                                              prefix=os.path.basename(target) + '.',
                                              suffix='.tmp')
        with os.fdopen(temp_fd, 'w') as iwyu_data_file:
            json.dump(IWYU_ANALYSIS_STATE, iwyu_data_file, sort_keys=True, indent=4)

        # atomic replace prevents ctrl+c mashing from destroying everything, at least we
        # can keep the original data safe from emotional outbursts.
        os.replace(temp_path, target)
        temp_path = None
    except OSError as exc:
        # keep this best effort, but make the failure visible instead of dropping it silently
        print(f"Failed to write IWYU data to {target}: {exc}", file=sys.stderr)
    finally:
        # remove a leftover temp file if we did not get as far as the rename
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError:
                pass


def need_to_process(cmd_entry: CompileCommand,
                    custom_printer: Callable[[str], None] = printer) -> Optional[ResultType]:
    """
    Step 1: decide whether a source file needs to be run at all.

    Sources matching a skip prefix (for example build or third_party), sources already
    known to cycle, and conftest files are not run (NOT_RUNNING).

    For everything else the hashes of the previously recorded deps are recalculated. If the
    cumulative hash is unchanged and the current mode (CHECK or FIX) succeeded last time the
    result is NO_CHANGE. None means the file has to be processed.
    """

    source = cmd_entry.file
    skipped = (source.startswith(SKIP_FILES) or source in CYCLE_FILES
               or '/conftest_' in source)
    if skipped:
        custom_printer(f"{Fore.YELLOW}[5] - Not running!: {cmd_entry.file}{Fore.RESET}")
        return ResultType.NOT_RUNNING

    previous = IWYU_ANALYSIS_STATE.get(source)
    if not previous:
        return None

    recorded = previous['hashes']
    hashes = recalc_hashes(recorded['deps'].keys())

    # only a success in the same mode counts, otherwise we assume a rerun is needed
    mode_success = 'CHECK' if command_line_args.check else 'FIX'

    if command_line_args.verbose:
        # explain which files differ between the recorded and the recalculated state
        diff_files = list(
            set(hashes['deps'].keys()).symmetric_difference(set(recorded['deps'].keys())))
        if diff_files:
            listing = ''.join(f'{file}\n' for file in diff_files)
            debug_printer(f"[1] Need to process {cmd_entry.file} because different files:\n" +
                          listing)
        for file, recorded_hash in recorded['deps'].items():
            if file in hashes['deps'] and hashes['deps'][file] != recorded_hash:
                debug_printer(
                    f"[1] Need to process {cmd_entry.file} because hash changed:\n{file}: {hashes['deps'][file]}\n{file}: {recorded_hash}"
                )

    unchanged = hashes['full_hash'] == recorded['full_hash']
    if unchanged and mode_success in previous.get('success', []):
        custom_printer(f"{Fore.YELLOW}[5] - No Change!  : {cmd_entry.file}{Fore.RESET}")
        return ResultType.NO_CHANGE

    return None


def calc_dep_headers(cmd_entry: CompileCommand) -> Optional[ResultType]:
    """
    The second step in the IWYU process.

    We need to get a list of headers which are dependencies so we can copy them to an isolated
    working directory (so parallel IWYU changes don't break us). We will switch on preprocessor
    for faster generation of the dep file.

    Once we have the deps list, we parse it and calc the hashes of the deps. The hashes are
    stored in IWYU_ANALYSIS_STATE for this source and its recorded successes are reset.

    Returns RESUBMIT if the dep generation command fails or times out, otherwise None.
    """

    try:
        with tempfile.NamedTemporaryFile() as depfile:

            # first time we could be executing a real command so we make sure the output
            # dirs exist so the compiler is not mad
            outputs = shlex.split(cmd_entry.output)
            for output in outputs:
                out_dir = os.path.dirname(output)
                if out_dir:
                    os.makedirs(out_dir, exist_ok=True)

            # setup up command for fast depfile generation: only preprocess (-E instead of
            # -c) and have the compiler write the dep file
            cmd = cmd_entry.command
            cmd += f' -MD -MF {depfile.name}'
            cmd = cmd.replace(' -c ', ' -E ')
            debug_printer(f"[1] - Getting Deps: {cmd_entry.file}")

            try:
                deps_proc = subprocess.run(cmd, shell=True, capture_output=True, text=True,
                                           timeout=300)
            except subprocess.TimeoutExpired:
                deps_proc = None

            # if successful, record the latest deps with their hashes, otherwise try again later
            if deps_proc is None or deps_proc.returncode != 0:
                printer(f"{Fore.RED}[5] - Deps Failed!: {cmd_entry.file}{Fore.RESET}")
                if deps_proc is None:
                    # a timeout leaves no process result, so there is no stderr to print
                    printer("timed out after 300 seconds while generating the dep file")
                else:
                    printer(deps_proc.stderr)
                return ResultType.RESUBMIT
            else:
                with open(depfile.name) as deps:
                    # join the backslash continued lines of the make style dep file
                    deps_str = deps.read()
                    deps_str = deps_str.replace('\\\n', '').strip()

                    # the first token is the make target, everything after it is a dep
                    hashes = recalc_hashes(shlex.split(deps_str)[1:])
                    if not IWYU_ANALYSIS_STATE.get(cmd_entry.file):
                        IWYU_ANALYSIS_STATE[cmd_entry.file] = asdict(cmd_entry)
                    IWYU_ANALYSIS_STATE[cmd_entry.file]['hashes'] = hashes
                    IWYU_ANALYSIS_STATE[cmd_entry.file]['success'] = []

    # if the dep command failed the context can throw an exception on exit; the traceback
    # is printed and processing continues
    except FileNotFoundError as exc:
        traceback.print_exc()
        if depfile.name in str(exc):
            pass

    return None


def execute_iwyu(cmd_entry: CompileCommand, test_dir: str) -> Union[ResultType, bytes]:
    """
    Step 3: run the IWYU binary on the source. Check mode ends with this step.

    In fix mode test_dir is the isolated directory holding the copied header tree; in
    check mode the caller passes the real project root instead.

    Returns the filtered IWYU output encoded as utf-8 on success, or a ResultType
    describing the failure.
    """

    # only a pure clang++ build can be translated into an IWYU invocation
    clang_binary = f'{TOOLCHAIN_DIR}/clang++'
    if not cmd_entry.command.startswith(clang_binary):
        printer("unexpected compiler:")
        printer(cmd_entry.command)
        return ResultType.FAILED

    # same arguments, but run by the IWYU binary plus the extra include dirs and options
    compiler_args = cmd_entry.command[len(clang_binary):]
    cmd = ' '.join([
        f'{TOOLCHAIN_DIR}/include-what-you-use' + compiler_args,
        ' '.join(get_clang_includes()),
        ' '.join(IWYU_OPTIONS),
    ])

    # mimic the PATH we normally use in our build
    env = os.environ.copy()
    env['PATH'] = env['PATH'] + f':{TOOLCHAIN_DIR}'

    debug_printer(f'[2] - Running IWYU: {cmd_entry.file}')
    proc = subprocess.run(cmd, shell=True, env=env, capture_output=True, cwd=test_dir)
    stderr_text = proc.stderr.decode('utf-8')

    def is_fwd_declare(text: str) -> bool:
        # a class/struct mention that is neither a section header, an include line nor a
        # removal line is treated as a forward declare suggestion
        if text.endswith(':') or text.startswith(('#include ', '-')):
            return False
        return 'class ' in text or 'struct ' in text

    # IWYU sometimes still recommends forward declares even though --no_fwd_decls is passed,
    # and sometimes they are wrong and break the compile, so they are filtered out.
    stripped_lines = (raw.strip() for raw in stderr_text.split('\n'))
    iwyu_output = '\n'.join(text for text in stripped_lines if not is_fwd_declare(text))

    # IWYU has weird exit codes, where a >=2 is considered success:
    # https://github.com/include-what-you-use/include-what-you-use/blob/clang_12/iwyu_globals.h#L27-L34
    exit_code = proc.returncode
    if command_line_args.check and exit_code != 2:
        printer(f"{Fore.RED}[2] - IWYU Failed: {cmd_entry.file}{Fore.RESET}")
        if exit_code < 2:
            printer(f"exited with error: {exit_code}")
        else:
            printer(f"changes required: {exit_code - 2}")
        printer(iwyu_output)
        return failed_return()

    if exit_code < 2:
        printer(f'{Fore.RED}[2] - IWYU Failed : {cmd_entry.file}{Fore.RESET}')
        printer(cmd)
        printer(str(exit_code))
        printer(stderr_text)
        copy_error_state(cmd_entry, test_dir)
        return failed_return()

    # keep the output next to the compile output for debugging or later inspection
    iwyu_report = os.path.splitext(cmd_entry.output)[0] + '.iwyu'
    with open(iwyu_report, 'w') as iwyu_out:
        iwyu_out.write(iwyu_output)

    return iwyu_output.encode('utf-8')


def apply_fixes(cmd_entry: CompileCommand, iwyu_output: bytes,
                test_dir: str) -> Optional[ResultType]:
    """
    Step 4: feed the IWYU output to fix_includes.py inside the test directory.

    The script comes from the toolchain directory and is run with the current python
    interpreter and the configured fix options. Returns RESUBMIT if it does not finish
    within the timeout, otherwise None.
    """
    fix_script = f'{TOOLCHAIN_DIR}/fix_includes.py'
    cmd = [f'{sys.executable}', fix_script, *IWYU_FIX_OPTIONS]

    debug_printer(f'[3] - Apply fixes : {cmd_entry.file}')
    try:
        subprocess.run(cmd, capture_output=True, input=iwyu_output, timeout=180, cwd=test_dir)
    except subprocess.TimeoutExpired:
        printer(f"{Fore.RED}[5] - Apply failed: {cmd_entry.file}{Fore.RESET}")
        return ResultType.RESUBMIT
    else:
        return None


def test_compile(cmd_entry: CompileCommand, test_dir: str) -> Optional[ResultType]:
    """
    Step 5 in the IWYU analysis and the last step for fix mode.

    The normal compile command is executed inside the test directory. When it
    succeeds the hashes of its header dependencies are recorded in the analysis
    state and ResultType.SUCCESS is returned, which lets the caller copy the
    changed files back into the real source tree.

    Returns the value of failed_return() when the compile fails, the result of
    check_for_cycles() when that is truthy, and None when a FileNotFoundError
    was swallowed along the way.
    """

    try:
        with tempfile.NamedTemporaryFile() as depfile:
            debug_printer(f"[4] - Test compile: {cmd_entry.file}")

            # the header deps are captured again because IWYU may have changed them
            compile_cmd = cmd_entry.command + f' -MMD -MF {depfile.name}'

            compile_proc = None
            try:
                compile_proc = subprocess.run(compile_cmd, shell=True, capture_output=True,
                                              text=True, timeout=300, cwd=test_dir)
            except (subprocess.TimeoutExpired, MemoryError):
                # NOTE(review): compile_proc stays None here, so a timed out compile
                # is not reported as a failure below -- confirm this is intended.
                pass

            if compile_proc is not None and compile_proc.returncode != 0:
                # the test compile failed: report it and keep the state for debugging
                printer(f"{Fore.RED}[5] - IWYU Failed!: {cmd_entry.file}{Fore.RESET}")
                printer(f"{compile_cmd}")
                printer(f"{compile_proc.stderr}")
                copy_error_state(cmd_entry, test_dir)
                return failed_return()

            with open(depfile.name) as deps:
                # join the continuation lines and skip the first token (presumably
                # the make target), then hash the deps behind this compile
                dep_text = deps.read().replace('\\\n', '').strip()
                hashes = recalc_hashes(shlex.split(dep_text)[1:], change_dir=test_dir)

                cycle_result = check_for_cycles(cmd_entry, hashes, test_dir)
                if cycle_result:
                    return cycle_result

                file_state = IWYU_ANALYSIS_STATE[cmd_entry.file]
                file_state['hashes'] = hashes
                if 'FIX' not in file_state['success']:
                    file_state['success'].append('FIX')
                printer(f"{Fore.GREEN}[5] - IWYU Success: {cmd_entry.file}{Fore.RESET}")
                return ResultType.SUCCESS

    # the depfile may not have been generated if something failed; the error
    # is ignored either way
    except FileNotFoundError as exc:
        if depfile.name in str(exc):
            pass

    return None


def intialize_deps(cmd_entry: CompileCommand) -> Tuple[ResultType, CompileCommand]:
    """
    Initialize the header deps of a compile command when running in fix mode.

    This is mainly used to improve the overall time to complete a full analysis.
    We want to process the source files in order from least dependencies to most
    dependencies. The rationale is that a file with a lot of dependencies should
    be done last, so any changes in those dependencies are automatically accounted
    for and the chance of needing to do rework is lessened. Also the progress bar
    can be more accurate and not count skipped files.

    Returns the result paired with the unchanged cmd_entry.
    """

    # step 1
    skip_result = need_to_process(cmd_entry, custom_printer=debug_printer)
    if skip_result:
        return skip_result, cmd_entry

    # deps recorded by a previous run are a good enough indicator of how
    # dependency heavy the file is, which saves invoking the compiler again.
    try:
        known_dep_count = len(IWYU_ANALYSIS_STATE[cmd_entry.file]['hashes']['deps'])
    except KeyError:
        known_dep_count = 0
    if known_dep_count:
        return ResultType.SUCCESS, cmd_entry

    # nothing recorded yet: compute the deps now and pass on any non-success result
    dep_result = calc_dep_headers(cmd_entry)
    return (dep_result or ResultType.SUCCESS), cmd_entry


def check_iwyu(cmd_entry: CompileCommand) -> ResultType:
    """
    Thread function used by the main thread pool executor in check mode.

    Runs the analysis up to step 3 (steps at the top comment) and reports
    success if IWYU reports no required changes.
    """

    # steps 1 and 2: either one may already decide the outcome for this file
    for early_step in (need_to_process, calc_dep_headers):
        early_result = early_step(cmd_entry)
        if early_result:
            return early_result

    # step 3: a ResultType coming back (instead of the IWYU output) ends the job
    iwyu_out = execute_iwyu(cmd_entry, '.')
    if isinstance(iwyu_out, ResultType):
        return iwyu_out

    # success!
    printer(f"{Fore.GREEN}[2] - IWYU Success: {cmd_entry.file}{Fore.RESET}")
    passed_modes = IWYU_ANALYSIS_STATE[cmd_entry.file]['success']
    if "CHECK" not in passed_modes:
        passed_modes.append('CHECK')
    return ResultType.SUCCESS


def fix_iwyu(cmd_entry: CompileCommand) -> ResultType:
    """
    Thread function used by the main thread pool executor in fix mode.

    Runs the analysis up to step 5 (steps at the top comment) and reports
    success if the original command still compiles after IWYU has made its
    changes. Only then are the changed files moved out of the test directory.
    """

    # steps 1 and 2: either one may already decide the outcome for this file
    for early_step in (need_to_process, calc_dep_headers):
        early_result = early_step(cmd_entry)
        if early_result:
            return early_result

    with tempfile.TemporaryDirectory() as test_dir:

        # all changes happen in an isolated test dir so they do not conflict
        # with other concurrent processes.
        test_source_files = setup_test_dir(cmd_entry, test_dir)

        # first round of pragmas so IWYU doesn't fail or remove things we want to keep
        add_pragmas(test_source_files)

        # step 3
        iwyu_out = execute_iwyu(cmd_entry, test_dir)
        if isinstance(iwyu_out, ResultType):
            return iwyu_out

        # collect exactly the files IWYU operated on so only those are copied back
        changed_files = []
        for reported in re.findall(CHANGED_FILES_REGEX, iwyu_out.decode('utf-8')):
            if in_project_root(reported):
                changed_files.append(os.path.join(test_dir, reported))

        # the additions are computed up front, against the list as it is now
        additions = [path for path in changed_files if path not in test_source_files]
        test_source_files.extend(additions)

        # step 4
        fix_result = apply_fixes(cmd_entry, iwyu_out, test_dir)
        if fix_result:
            return fix_result

        # final round of pragmas for the next time this goes through IWYU
        add_pragmas(test_source_files)

        # step 5
        result = test_compile(cmd_entry, test_dir)
        if result != ResultType.SUCCESS:
            return result

        # strip "<test_dir>/" from each path to get the destination of the move
        prefix_len = len(test_dir) + 1
        for staged in test_source_files:
            if os.path.exists(staged):
                shutil.move(staged, staged[prefix_len:])

        return result


def run_iwyu(cmd_entry: CompileCommand) -> Tuple[ResultType, CompileCommand]:
    """Dispatch to the check or fix worker and pair its result with cmd_entry."""

    worker = check_iwyu if command_line_args.check else fix_iwyu
    return worker(cmd_entry), cmd_entry


def main() -> None:
    """
    Main function.

    Loads the state saved by prior runs (if any) and the compile commands, picks
    the slice of commands selected by the start/end ratios and runs the check or
    fix jobs for them on a thread pool. The jobs are repeated until a full pass
    produces no new changes.

    In fix mode the header dependencies are initialized first, so the files with
    the fewest dependencies can be processed first.

    The process exits with status 1 when a generated source file is missing, the
    selected range is empty or inverted, or any job fails.
    """
    global IWYU_ANALYSIS_STATE, SHUTDOWN_FLAG  # pylint: disable=global-statement
    atexit.register(write_iwyu_data)

    # os.sched_getaffinity is not available on every platform, fall back to the
    # plain CPU count where it is missing.
    if hasattr(os, 'sched_getaffinity'):
        usable_cpus = len(os.sched_getaffinity(0))
    else:
        usable_cpus = os.cpu_count() or 1

    def dep_sorted(cmd_entry):
        """Sort key: number of recorded header deps, 0 when none are known."""
        try:
            return len(IWYU_ANALYSIS_STATE[cmd_entry.file]['hashes']['deps'])
        except KeyError:
            return 0

    with concurrent.futures.ThreadPoolExecutor(max_workers=usable_cpus + 4) as executor:

        # ctrl+c: try to shutdown as fast as possible.
        def sigint_handler(the_signal, frame):
            executor.shutdown(wait=False, cancel_futures=True)
            sys.exit(1)

        signal.signal(signal.SIGINT, sigint_handler)

        # load in any data from prior runs
        if os.path.exists(command_line_args.iwyu_data):
            with open(command_line_args.iwyu_data) as iwyu_data_file:
                IWYU_ANALYSIS_STATE = json.load(iwyu_data_file)

        # load in the compile commands
        with open(command_line_args.compile_commands) as compdb_file:
            compiledb = [CompileCommand(**json_data) for json_data in json.load(compdb_file)]

            # make sure the generated source code has been generated
            for cmd_entry in compiledb:
                if cmd_entry.file.endswith('_gen.cpp') and not os.path.exists(cmd_entry.file):
                    printer(f"{Fore.RED}[5] - Missing Gen!: {cmd_entry.file}{Fore.RESET}")
                    printer(
                        f"Error: missing generated file {cmd_entry.file}, make sure generated-sources are generated."
                    )
                    sys.exit(1)

            # translate the ratios into indexes clamped to [0, total_cmds]
            total_cmds = len(compiledb)
            start_index = min(max(int(total_cmds * command_line_args.start_ratio), 0), total_cmds)
            end_index = min(max(int(total_cmds * command_line_args.end_ratio), 0), total_cmds)

            if start_index == end_index:
                print(f"Error: start_index and end_index are the same: {start_index}")
                sys.exit(1)
            if start_index > end_index:
                print(
                    f"Error: start_index {start_index} can not be greater than end_index {end_index}"
                )
                sys.exit(1)

            print(f"Analyzing compile commands from {start_index} to {end_index}.")
            compiledb = compiledb[start_index:end_index]
            if not command_line_args.check:
                # We can optimize the order we process things by processing source files
                # with the least number of dependencies first. This is a cost up front
                # but will result in huge gains in the amount of re-processing to be done.
                printer("Getting Initial Header Dependencies...")
                cmd_entry_list = []
                try:
                    with tqdm(total=len(compiledb), disable=None) as pbar:

                        # create and run the dependency check jobs
                        future_cmd = {
                            executor.submit(intialize_deps, cmd_entry): cmd_entry
                            for cmd_entry in compiledb
                        }
                        for future in concurrent.futures.as_completed(future_cmd):
                            result, cmd_entry = future.result()
                            # entries reported as NOT_RUNNING are left out of the run
                            if result != ResultType.NOT_RUNNING:
                                cmd_entry_list.append(cmd_entry)
                            pbar.update(1)
                except Exception:
                    SHUTDOWN_FLAG = True
                    traceback.print_exc()
                    executor.shutdown(wait=True, cancel_futures=True)
                    sys.exit(1)
            else:
                cmd_entry_list = compiledb

            try:

                # this loop will keep looping until a full run produces no new changes.
                changes_left = True
                while changes_left:
                    changes_left = False

                    with tqdm(total=len(cmd_entry_list), disable=None) as pbar:

                        # create and run the IWYU jobs, least dependencies first
                        future_cmd = {
                            executor.submit(run_iwyu, cmd_entry): cmd_entry
                            for cmd_entry in sorted(cmd_entry_list, key=dep_sorted)
                        }

                        # process the results
                        for future in concurrent.futures.as_completed(future_cmd):
                            result, cmd_entry = future.result()

                            # any result which implies there could be changes required sets the
                            # next loop
                            if result not in (ResultType.NO_CHANGE, ResultType.NOT_RUNNING):
                                changes_left = True

                            # if a file is considered done for this loop, update the status bar
                            if result in (ResultType.SUCCESS, ResultType.NO_CHANGE,
                                          ResultType.NOT_RUNNING):
                                pbar.update(1)
                            # resubmit jobs which may have a better chance to run later
                            elif result == ResultType.RESUBMIT:
                                # NOTE(review): this future is not tracked, so its result is
                                # never inspected here; changes_left is already set, so the
                                # file is picked up again by the next pass of the outer loop.
                                executor.submit(run_iwyu, cmd_entry)
                            # handle a failure case, the exception quickly drops us out of
                            # this loop.
                            else:
                                SHUTDOWN_FLAG = True
                                # test_compile may hand back None, which has no name attribute
                                result_name = getattr(result, 'name', str(result))
                                tqdm.write(
                                    f"{result_name}: Shutting down other threads, please be patient."
                                )
                                raise Exception(f'Shutdown due to {result_name} {cmd_entry.file}')

            except Exception:
                SHUTDOWN_FLAG = True
                traceback.print_exc()
                executor.shutdown(wait=True, cancel_futures=True)
                sys.exit(1)
            finally:
                if CYCLE_FILES:
                    printer(f"{Fore.YELLOW} Cycles detected:")
                    for file in CYCLE_FILES:
                        printer(f'    {file}')


# Script entry point. The call is unconditional (there is no __name__ guard), so
# importing this module runs the analysis as well.
main()