summaryrefslogtreecommitdiff
path: root/buildscripts/large_file_check.py
blob: 7c5388f1f34f84f9951563a0180ba3e052340f98 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/usr/bin/env python3
"""Check files in git diff to ensure they are within a given size limit."""

# pylint: disable=wrong-import-position

import argparse
import fnmatch
import logging
import os
import pathlib
import sys
import textwrap

from typing import Any, Callable, Dict, List, Optional, Tuple

import structlog

from git import Repo

# Parent of the directory that contains this file (symlinks resolved, path made absolute).
mongo_dir = os.path.dirname(os.path.dirname(os.path.abspath(os.path.realpath(__file__))))
# Get relative imports to work when the package is not installed on the PYTHONPATH.
# This must run before the `buildscripts.*` imports below so they can be resolved.
if __name__ == "__main__" and __package__ is None:
    sys.path.append(mongo_dir)

from buildscripts.linter import git
from buildscripts.patch_builds.change_data import (RevisionMap, find_changed_files_in_repos,
                                                   generate_revision_map)


# Console renderer for structured logging
def renderer(_logger: logging.Logger, _name: str, eventdict: Dict[Any, Any]) -> str:
    """
    Render a structlog event dict as a single console line.

    The first of the keys ``files``, ``repo`` or ``file`` found in the event selects the layout;
    a ``file`` event additionally reports ``bytes`` when that key is present. An event with none
    of those keys is rendered as the bare event text.

    :param _logger: Logger handed over by the processor chain (unused).
    :param _name: Name of the log method handed over by the processor chain (unused).
    :param eventdict: Fields of the event; every layout reads the ``event`` key.
    :return: The formatted log line.
    """
    # Pick the layout first, then format exactly once.
    if 'files' in eventdict:
        template = "{event}: {files}"
    elif 'repo' in eventdict:
        template = "{event}: {repo}"
    elif 'file' in eventdict:
        template = "{event}: {file} {bytes} bytes" if 'bytes' in eventdict else "{event}: {file}"
    else:
        template = "{event}"
    return template.format(**eventdict)


# Configure the logger so it doesn't spam output on huge diffs
structlog.configure(
    logger_factory=structlog.stdlib.LoggerFactory(),
    wrapper_class=structlog.stdlib.BoundLogger,
    cache_logger_on_first_use=True,
    processors=[
        # Level filtering runs first; `renderer` (defined in this file) is last and turns the
        # event dict into the final output string.
        structlog.stdlib.filter_by_level,
        renderer,
    ],
)

# Structured logger shared by every function in this module.
LOGGER = structlog.get_logger(__name__)
# Environment variables read for the "mongo" and "enterprise" entries of the revision map.
MONGO_REVISION_ENV_VAR = "REVISION"
ENTERPRISE_REVISION_ENV_VAR = "ENTERPRISE_REV"


def _get_repos_and_revisions() -> Tuple[List[Repo], RevisionMap]:
    """
    Open the module repositories and build the map of revisions to compare against.

    The revisions for the "mongo" and "enterprise" entries are read from the environment
    variables named by MONGO_REVISION_ENV_VAR and ENTERPRISE_REVISION_ENV_VAR; an unset variable
    yields None for its entry.

    :return: The list of repo objects and the revision map generated for them.
    """
    module_paths = git.get_module_paths()
    repos = list(map(Repo, module_paths))

    requested_revisions = {
        "mongo": os.environ.get(MONGO_REVISION_ENV_VAR),
        "enterprise": os.environ.get(ENTERPRISE_REVISION_ENV_VAR),
    }
    return repos, generate_revision_map(repos, requested_revisions)


def git_changed_files(excludes: List[pathlib.Path]) -> List[pathlib.Path]:
    """
    Collect the changed files that exist on disk and are not excluded.

    :param excludes: Patterns, matched with fnmatch, for files to leave out of the result.
    :return: List of changed files that remain after filtering.
    """
    repos, revision_map = _get_repos_and_revisions()
    LOGGER.debug("revisions", revision=revision_map)

    def _is_candidate(changed: pathlib.Path) -> bool:
        # Paths that are not on disk cannot be size-checked (presumably files removed by the
        # change -- confirm against find_changed_files_in_repos).
        if not changed.exists():
            return False
        return not any(fnmatch.fnmatch(changed, pattern) for pattern in excludes)

    changed_paths = [
        pathlib.Path(name) for name in find_changed_files_in_repos(repos, revision_map)
    ]
    files = [path for path in changed_paths if _is_candidate(path)]

    LOGGER.debug("Found files to check", files=[str(path) for path in files])
    return files


def diff_file_sizes(size_limit: int, excludes: Optional[List[str]] = None) -> List[pathlib.Path]:
    """
    Find the changed files whose size on disk exceeds a limit.

    :param size_limit: Maximum allowed size in bytes; only strictly larger files are reported.
    :param excludes: Patterns for files to skip; None means nothing is excluded.
    :return: List of changed files that are larger than size_limit.
    """
    patterns = [] if excludes is None else excludes
    oversized: List[pathlib.Path] = []

    for changed_file in git_changed_files(patterns):
        display_name = str(changed_file)
        LOGGER.debug("Checking file size", file=display_name)

        size_in_bytes = changed_file.stat().st_size
        if size_in_bytes <= size_limit:
            continue

        LOGGER.error("File too large", file=display_name, bytes=size_in_bytes)
        oversized.append(changed_file)

    return oversized


def main(*args: str) -> int:
    """
    Execute Main entry point.

    Parses the command line, sets the logging verbosity, and checks the size of every changed
    file against the configured limit.

    :param args: Full argument vector; the first element (the program name) is skipped.
    :return: 0 when all checked files are within the limit, 1 when at least one is too large.
    """

    parser = argparse.ArgumentParser(
        description='Git commit large file checker.', epilog=textwrap.dedent('''\
        NOTE: The --exclude argument is an exact match but can accept glob patterns. If * is used,
        it matches *all* characters, including path separators.
    '''))
    parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
    parser.add_argument("--exclude", help="Paths to exclude from check", nargs="+",
                        type=pathlib.Path, required=False)
    # Use a real int default instead of a string that argparse has to coerce.
    parser.add_argument("--size-mb", help="File size limit (MiB)", type=int, default=10)
    parsed_args = parser.parse_args(args[1:])

    # Setting the stdlib log level is sufficient to switch verbosity: the filter_by_level
    # processor in the structlog configuration consults it for every event. Calling that
    # processor directly here configured nothing and could raise DropEvent, so it is not done.
    logging.basicConfig(level=logging.DEBUG if parsed_args.verbose else logging.INFO)

    # The limit is given in MiB on the command line; sizes are compared in bytes.
    size_limit_bytes = parsed_args.size_mb * 1024 * 1024
    large_files = diff_file_sizes(size_limit_bytes, parsed_args.exclude)
    if not large_files:
        LOGGER.info("All files passed size check")
        return 0

    LOGGER.error("Some files failed size check", files=list(map(str, large_files)))
    return 1


# When run as a script, pass the full argv to main() and exit with its return value.
if __name__ == '__main__':
    sys.exit(main(*sys.argv))