# Copyright 2019 The Chromium Authors. All rights reserved. # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. """Parses ELF information without relying external tools. This file was originally copied and adapted from: https://fuchsia.googlesource.com/fuchsia/+/827f9fe/build/images/elfinfo.py """ from contextlib import contextmanager from collections import namedtuple import mmap import os import struct import uuid # Standard ELF constants. ELFMAG = '\x7fELF' EI_CLASS = 4 ELFCLASS32 = 1 ELFCLASS64 = 2 EI_DATA = 5 ELFDATA2LSB = 1 ELFDATA2MSB = 2 EM_386 = 3 EM_ARM = 40 EM_X86_64 = 62 EM_AARCH64 = 183 PT_LOAD = 1 PT_DYNAMIC = 2 PT_INTERP = 3 PT_NOTE = 4 DT_NEEDED = 1 DT_STRTAB = 5 DT_SONAME = 14 NT_GNU_BUILD_ID = 3 SHT_SYMTAB = 2 class elf_note(namedtuple('elf_note', [ 'name', 'type', 'desc', ])): # An ELF note is identified by (name_string, type_integer). def ident(self): return (self.name, self.type) def is_build_id(self): return self.ident() == ('GNU\0', NT_GNU_BUILD_ID) def build_id_hex(self): if self.is_build_id(): return ''.join(('%02x' % ord(byte)) for byte in self.desc) return None def __repr__(self): return ('elf_note(%r, %#x, <%d bytes>)' % (self.name, self.type, len(self.desc))) def gen_elf(): # { 'Struct1': (ELFCLASS32 fields, ELFCLASS64 fields), # 'Struct2': fields_same_for_both, ... } elf_types = { 'Ehdr': ([ ('e_ident', '16s'), ('e_type', 'H'), ('e_machine', 'H'), ('e_version', 'I'), ('e_entry', 'I'), ('e_phoff', 'I'), ('e_shoff', 'I'), ('e_flags', 'I'), ('e_ehsize', 'H'), ('e_phentsize', 'H'), ('e_phnum', 'H'), ('e_shentsize', 'H'), ('e_shnum', 'H'), ('e_shstrndx', 'H'), ], [ ('e_ident', '16s'), ('e_type', 'H'), ('e_machine', 'H'), ('e_version', 'I'), ('e_entry', 'Q'), ('e_phoff', 'Q'), ('e_shoff', 'Q'), ('e_flags', 'I'), ('e_ehsize', 'H'), ('e_phentsize', 'H'), ('e_phnum', 'H'), ('e_shentsize', 'H'), ('e_shnum', 'H'), ('e_shstrndx', 'H'), ]), 'Phdr': ([ ('p_type', 'I'), ('p_offset', 'I'), ('p_vaddr', 'I'), ('p_paddr', 'I'), ('p_filesz', 'I'), ('p_memsz', 'I'), ('p_flags', 'I'), ('p_align', 'I'), ], [ ('p_type', 'I'), ('p_flags', 'I'), ('p_offset', 'Q'), ('p_vaddr', 'Q'), ('p_paddr', 'Q'), ('p_filesz', 'Q'), ('p_memsz', 'Q'), ('p_align', 'Q'), ]), 'Shdr': ([ ('sh_name', 'L'), ('sh_type', 'L'), ('sh_flags', 'L'), ('sh_addr', 'L'), ('sh_offset', 'L'), ('sh_size', 'L'), ('sh_link', 'L'), ('sh_info', 'L'), ('sh_addralign', 'L'), ('sh_entsize', 'L'), ], [ ('sh_name', 'L'), ('sh_type', 'L'), ('sh_flags', 'Q'), ('sh_addr', 'Q'), ('sh_offset', 'Q'), ('sh_size', 'Q'), ('sh_link', 'L'), ('sh_info', 'L'), ('sh_addralign', 'Q'), ('sh_entsize', 'Q'), ]), 'Dyn': ([ ('d_tag', 'i'), ('d_val', 'I'), ], [ ('d_tag', 'q'), ('d_val', 'Q'), ]), 'Nhdr': [ ('n_namesz', 'I'), ('n_descsz', 'I'), ('n_type', 'I'), ], 'dwarf2_line_header': [ ('unit_length', 'L'), ('version', 'H'), ('header_length', 'L'), ('minimum_instruction_length', 'B'), ('default_is_stmt', 'B'), ('line_base', 'b'), ('line_range', 'B'), ('opcode_base', 'B'), ], 'dwarf4_line_header': [ ('unit_length', 'L'), ('version', 'H'), ('header_length', 'L'), ('minimum_instruction_length', 'B'), ('maximum_operations_per_instruction', 'B'), ('default_is_stmt', 'B'), ('line_base', 'b'), ('line_range', 'b'), ('opcode_base', 'B'), ], } # There is an accessor for each struct, e.g. Ehdr. # Ehdr.read is a function like Struct.unpack_from. # Ehdr.size is the size of the struct. elf_accessor = namedtuple('elf_accessor', ['size', 'read', 'write', 'pack']) # All the accessors for a format (class, byte-order) form one elf, # e.g. use elf.Ehdr and elf.Phdr. elf = namedtuple('elf', elf_types.keys()) def gen_accessors(is64, struct_byte_order): def make_accessor(type, decoder): return elf_accessor( size=decoder.size, read=lambda buffer, offset=0: type._make( decoder.unpack_from(buffer, offset)), write=lambda buffer, offset, x: decoder.pack_into( buffer, offset, *x), pack=lambda x: decoder.pack(*x)) for name, fields in elf_types.iteritems(): if isinstance(fields, tuple): fields = fields[1 if is64 else 0] type = namedtuple(name, [field_name for field_name, fmt in fields]) decoder = struct.Struct(struct_byte_order + ''.join( fmt for field_name, fmt in fields)) yield make_accessor(type, decoder) for elfclass, is64 in [(ELFCLASS32, False), (ELFCLASS64, True)]: for elf_bo, struct_bo in [(ELFDATA2LSB, '<'), (ELFDATA2MSB, '>')]: yield ((chr(elfclass), chr(elf_bo)), elf(*gen_accessors(is64, struct_bo))) # e.g. ELF[file[EI_CLASS], file[EI_DATA]].Ehdr.read(file).e_phnum ELF = dict(gen_elf()) def get_elf_accessor(file): # If it looks like an ELF file, whip out the decoder ring. if file[:len(ELFMAG)] == ELFMAG: return ELF[file[EI_CLASS], file[EI_DATA]] return None def gen_phdrs(file, elf, ehdr): for pos in xrange(0, ehdr.e_phnum * elf.Phdr.size, elf.Phdr.size): yield elf.Phdr.read(file, ehdr.e_phoff + pos) def gen_shdrs(file, elf, ehdr): for pos in xrange(0, ehdr.e_shnum * elf.Shdr.size, elf.Shdr.size): yield elf.Shdr.read(file, ehdr.e_shoff + pos) cpu = namedtuple( 'cpu', [ 'e_machine', # ELF e_machine int 'llvm', # LLVM triple CPU component 'gn', # GN target_cpu ]) ELF_MACHINE_TO_CPU = { elf: cpu(elf, llvm, gn) for elf, llvm, gn in [ (EM_386, 'i386', 'x86'), (EM_ARM, 'arm', 'arm'), (EM_X86_64, 'x86_64', 'x64'), (EM_AARCH64, 'aarch64', 'arm64'), ] } @contextmanager def mmapper(filename): """A context manager that yields (fd, file_contents) given a file name. This ensures that the mmap and file objects are closed at the end of the 'with' statement.""" fileobj = open(filename, 'rb') fd = fileobj.fileno() if os.fstat(fd).st_size == 0: # mmap can't handle empty files. try: yield fd, '' finally: fileobj.close() else: mmapobj = mmap.mmap(fd, 0, access=mmap.ACCESS_READ) try: yield fd, mmapobj finally: mmapobj.close() fileobj.close() elf_info = namedtuple( 'elf_info', [ 'filename', 'cpu', # cpu tuple 'notes', # list of (ident, desc): selected notes 'build_id', # string: lowercase hex 'stripped', # bool: Has no symbols or .debug_* sections 'interp', # string or None: PT_INTERP (without \0) 'soname', # string or None: DT_SONAME 'needed', # list of strings: DT_NEEDED ]) def get_elf_info(filename, match_notes=False): file = None elf = None ehdr = None phdrs = None # Yields an elf_note for each note in any PT_NOTE segment. def gen_notes(): def round_up_to(size): return ((size + 3) / 4) * 4 for phdr in phdrs: if phdr.p_type == PT_NOTE: pos = phdr.p_offset while pos < phdr.p_offset + phdr.p_filesz: nhdr = elf.Nhdr.read(file, pos) pos += elf.Nhdr.size name = file[pos:pos + nhdr.n_namesz] pos += round_up_to(nhdr.n_namesz) desc = file[pos:pos + nhdr.n_descsz] pos += round_up_to(nhdr.n_descsz) yield elf_note(name, nhdr.n_type, desc) def gen_sections(): shdrs = list(gen_shdrs(file, elf, ehdr)) if not shdrs: return strtab_shdr = shdrs[ehdr.e_shstrndx] for shdr, i in zip(shdrs, xrange(len(shdrs))): if i == 0: continue assert shdr.sh_name < strtab_shdr.sh_size, ( "%s: invalid sh_name" % filename) yield (shdr, extract_C_string(strtab_shdr.sh_offset + shdr.sh_name)) # Generates '\0'-terminated strings starting at the given offset, # until an empty string. def gen_strings(start): while True: end = file.find('\0', start) assert end >= start, ( "%s: Unterminated string at %#x" % (filename, start)) if start == end: break yield file[start:end] start = end + 1 def extract_C_string(start): for string in gen_strings(start): return string return '' # Returns a string of hex digits (or None). def get_build_id(): build_id = None for note in gen_notes(): # Note that the last build_id note needs to be used due to TO-442. possible_build_id = note.build_id_hex() if possible_build_id: build_id = possible_build_id return build_id # Returns a list of elf_note objects. def get_matching_notes(): if isinstance(match_notes, bool): if match_notes: return list(gen_notes()) else: return [] # If not a bool, it's an iterable of ident pairs. return [note for note in gen_notes() if note.ident() in match_notes] # Returns a string (without trailing '\0'), or None. def get_interp(): # PT_INTERP points directly to a string in the file. for interp in (phdr for phdr in phdrs if phdr.p_type == PT_INTERP): interp = file[interp.p_offset:interp.p_offset + interp.p_filesz] if interp[-1:] == '\0': interp = interp[:-1] return interp return None # Returns a set of strings. def get_soname_and_needed(): # Each DT_NEEDED or DT_SONAME points to a string in the .dynstr table. def GenDTStrings(tag): return (extract_C_string(strtab_offset + dt.d_val) for dt in dyn if dt.d_tag == tag) # PT_DYNAMIC points to the list of ElfNN_Dyn tags. for dynamic in (phdr for phdr in phdrs if phdr.p_type == PT_DYNAMIC): dyn = [ elf.Dyn.read(file, dynamic.p_offset + dyn_offset) for dyn_offset in xrange(0, dynamic.p_filesz, elf.Dyn.size) ] # DT_STRTAB points to the string table's vaddr (.dynstr). [strtab_vaddr] = [dt.d_val for dt in dyn if dt.d_tag == DT_STRTAB] # Find the PT_LOAD containing the vaddr to compute the file offset. [strtab_offset] = [ strtab_vaddr - phdr.p_vaddr + phdr.p_offset for phdr in phdrs if (phdr.p_type == PT_LOAD and phdr.p_vaddr <= strtab_vaddr and strtab_vaddr - phdr.p_vaddr < phdr.p_filesz) ] soname = None for soname in GenDTStrings(DT_SONAME): break return soname, set(GenDTStrings(DT_NEEDED)) return None, set() def get_stripped(): return all(shdr.sh_type != SHT_SYMTAB and not name.startswith('.debug_') for shdr, name in gen_sections()) def get_cpu(): return ELF_MACHINE_TO_CPU.get(ehdr.e_machine) # Map in the whole file's contents and use it as a string. with mmapper(filename) as mapped: fd, file = mapped elf = get_elf_accessor(file) if elf is not None: # ELF header leads to program headers. ehdr = elf.Ehdr.read(file) assert ehdr.e_phentsize == elf.Phdr.size, ( "%s: invalid e_phentsize" % filename) phdrs = list(gen_phdrs(file, elf, ehdr)) return elf_info(filename, get_cpu(), get_matching_notes(), get_build_id(), get_stripped(), get_interp(), *get_soname_and_needed()) return None # Module public API. __all__ = ['cpu', 'elf_info', 'elf_note', 'get_elf_accessor', 'get_elf_info']