Skip to content

bb_unpacker.py

ofrak_ghidra.components.blocks.bb_unpacker

GhidraBasicBlockUnpacker (BasicBlockUnpacker, OfrakGhidraMixin)

unpack(self, resource, config=None) async

Unpack a basic block into its corresponding instructions.

Source code in ofrak_ghidra/components/blocks/bb_unpacker.py
async def unpack(self, resource: Resource, config=None):
    """
    Unpack a basic block into its corresponding instructions.

    The instructions covering the basic block's address range are requested from
    ``self.batch_manager``; each returned entry is passed through ``_asm_fixups`` and
    turned into an ``Instruction`` child region of the basic block.

    :param resource: the resource to unpack; it is viewed as a ``BasicBlock``
    :param config: not read by this implementation

    :raises AssertionError: if the mode reported for an instruction differs from the mode
        of the basic block
    """
    bb_view: BasicBlock = await resource.view_as(BasicBlock)
    bb_start_vaddr = bb_view.virtual_address
    instructions = await self.batch_manager.get_result(
        (
            resource,
            bb_start_vaddr,
            bb_start_vaddr + bb_view.size - 1,  # Ghidra is inclusive
        ),
    )
    program_attrs = await resource.analyze(ProgramAttributes)

    children_created = []
    # A distinct name is used for the raw entry so it is not rebound to the Instruction
    # object built from it further down.
    for raw_instruction in instructions:
        vaddr = raw_instruction["instr_offset"]
        size = raw_instruction["instr_size"]
        mnem, operands = _asm_fixups(
            raw_instruction["mnem"].lower(),
            raw_instruction["operands"].lower(),
            program_attrs,
        )
        # The register-usage fields of the entry ("results", "regs_read", "regs_written")
        # are deliberately not parsed: Instruction is constructed without any register
        # information, so nothing here would consume them.
        # TODO A way to standardize register representations
        disasm = f"{mnem} {operands}"

        # Entries without an "instr_mode" key fall back to the NONE mode.
        mode_string = raw_instruction.get("instr_mode", "NONE")
        mode = InstructionSetMode[mode_string]
        assert mode == bb_view.mode, (
            f"The instruction mode {mode.name} returned by Ghidra does not match the basic "
            f"block mode {bb_view.mode.name}."
        )

        instruction = Instruction(
            virtual_address=vaddr,
            size=size,
            disassembly=disasm,
            mnemonic=mnem,
            operands=operands,
            mode=mode,
        )
        # Collect the awaitables and run them together once every instruction is built.
        children_created.append(
            bb_view.create_child_region(instruction, additional_attributes=(program_attrs,))
        )
    await asyncio.gather(*children_created)

_asm_fixups(base_mnemonic, base_operands, program_attrs) private

Fix up an assembly instruction from Ghidra, so that the toolchain can assemble it.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| base_mnemonic | str | original mnemonic from Ghidra | required |
| base_operands | str | original operands from Ghidra | required |
| program_attrs | ProgramAttributes | ProgramAttributes for the binary analyzed in Ghidra | required |

Returns:

| Type | Description |
| --- | --- |
| Tuple[str, str] | fixed up assembly instruction, as a (mnemonic, operands) pair |

Source code in ofrak_ghidra/components/blocks/bb_unpacker.py
def _asm_fixups(
    base_mnemonic: str, base_operands: str, program_attrs: ProgramAttributes
) -> Tuple[str, str]:
    """
    Rewrite an instruction reported by Ghidra so that the toolchain can assemble it.

    :param base_mnemonic: mnemonic exactly as Ghidra reported it
    :param base_operands: operand string exactly as Ghidra reported it
    :param program_attrs: ProgramAttributes for the binary analyzed in Ghidra

    :return: the ``(mnemonic, operands)`` pair after the fixups have been applied
    """
    # ISA-independent clean-up: space out the commas and fold "+ -" into a subtraction.
    operands = base_operands.replace(",", ", ").replace("+ -", "- ")
    # NOTE(review): RE_STRIP_PRECEDING_ZERO is defined elsewhere in the module; presumably
    # it drops leading zeros from hex literals -- confirm against its definition.
    operands = re.sub(RE_STRIP_PRECEDING_ZERO, r"0x\1", operands)

    isa = program_attrs.isa
    if isa is InstructionSet.ARM:
        # Ghidra's CPY is rewritten as the more commonly used MOV.
        return re.sub(RE_CPY_TO_MOV, "mov", base_mnemonic), operands
    if isa is not InstructionSet.M68K:
        # Every other ISA keeps Ghidra's mnemonic untouched.
        return base_mnemonic, operands

    # M68K: Ghidra follows the manual's syntax, whereas the GNU toolchain expects AT&T
    # syntax, so the special registers and immediates are re-spelled.
    operands = operands.replace("sp", "%SP").replace("sr", "%SR").replace(" 0x", " #0x")

    # For these mnemonics a hex literal at the very start of the operands gets a "#".
    immediate_mnemonics = (
        "moveq",
        "mov3q",
        "subq",
        "cmpi",
        "addq",
        "addi",
        "ori",
        "subi",
        "stop",
    )
    if any(candidate in base_mnemonic for candidate in immediate_mnemonics):
        operands = re.sub(r"^0x", r"#0x", operands)

    # Split on commas while keeping them (capturing group), so the pieces can simply be
    # joined back together. Pieces containing a hex literal are left alone so that hex
    # digits are not mistaken for address/data register names.
    rewritten = []
    for piece in re.split("(,)", operands):
        if "0x" not in piece:
            piece = re.sub(r"a([0-7])", r"%A\1", piece)
            piece = re.sub(r"d([0-7])[bw]?", r"%D\1", piece)
        rewritten.append(piece)
    return base_mnemonic, "".join(rewritten)