Skip to content

Commit

Permalink
qs: re-enable structure hints
Browse files Browse the repository at this point in the history
closes #770
  • Loading branch information
williballenthin committed Jun 1, 2023
1 parent b3de4cd commit e2e36ed
Show file tree
Hide file tree
Showing 2 changed files with 123 additions and 85 deletions.
207 changes: 123 additions & 84 deletions floss/qs/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
import pefile
import colorama
import lancelot
import intervaltree
import rich.traceback
from rich.text import Text
from rich.style import Style
Expand Down Expand Up @@ -196,7 +195,7 @@ def Span(text: str, style: Style = DEFAULT_STYLE) -> Text:

PADDING_WIDTH = 2
OFFSET_WIDTH = 8
STRUCTURE_WIDTH = 16
STRUCTURE_WIDTH = 20


def render_string_padding():
Expand Down Expand Up @@ -291,8 +290,9 @@ def render_string_structure(s: TaggedString):
ret = Text()

if s.structure:
structure = Span("/" + s.structure, style=MUTED_STYLE)
structure.align("left", STRUCTURE_WIDTH)
structure = Span(s.structure, style=Style(color="blue"))
structure.align("left", STRUCTURE_WIDTH - 1)
ret.append(Span("/", style=MUTED_STYLE))
ret.append(structure)
else:
ret.append_text(Span(" " * STRUCTURE_WIDTH))
Expand Down Expand Up @@ -528,6 +528,74 @@ def global_prevalence_hash_database_yaa_tagger(s: ExtractedString) -> Sequence[T
return ret


@dataclass
class Structure:
    """a named file structure together with the region of data it occupies."""

    # region of the underlying data that this structure covers
    slice: Slice

    # short human readable label, such as "section header" or "import table"
    name: str


def collect_pe_structures(slice: Slice, pe: pefile.PE) -> Sequence[Structure]:
    """
    collect the known PE structures, so that strings found within them can be annotated.

    currently recognized:
      - section headers
      - import table: the names of imported DLLs and of imported symbols

    args:
      slice: the region holding the PE file; each structure is a sub-slice of it.
      pe: the parsed PE file.

    returns: the structures, section headers first, followed by import table entries.
    """
    structures = []

    def add(offset: int, size: int, name: str):
        # every structure is recorded the same way: a sub-slice plus a label.
        structures.append(Structure(slice=slice.slice(offset, size), name=name))

    for section in sorted(pe.sections, key=lambda s: s.PointerToRawData):
        # offset and size as reported by pefile for the section structure itself
        add(section.get_file_offset(), section.sizeof(), "section header")

    if hasattr(pe, "DIRECTORY_ENTRY_IMPORT"):
        for dll in pe.DIRECTORY_ENTRY_IMPORT:
            try:
                dll_name = dll.dll.decode("ascii")
            except UnicodeDecodeError:
                continue

            try:
                offset = pe.get_offset_from_rva(dll.struct.Name)
            except pefile.PEFormatError:
                # malformed import descriptor: the name RVA cannot be mapped to a file offset.
                # skip just the DLL name, rather than aborting the entire layout;
                # the imported symbols below carry their own offsets.
                pass
            else:
                add(offset, len(dll_name), "import table")

            for entry in dll.imports:
                if entry.name is None or entry.name_offset is None:
                    continue

                try:
                    symbol_name = entry.name.decode("ascii")
                except UnicodeDecodeError:
                    continue

                add(entry.name_offset, len(symbol_name), "import table")

    # TODO: other structures
    # export table
    # certificate data
    # rich header

    return structures


@dataclass
class Layout(abc.ABC):
"""
Expand All @@ -551,6 +619,7 @@ class Layout(abc.ABC):
such as a PE file, a section, a segment, or a resource.
subclasses can provide more specific behavior when it comes to tagging strings.
"""

slice: Slice

# human readable name
Expand Down Expand Up @@ -645,6 +714,37 @@ def tag_strings(self, taggers: Sequence[Tagger]):
for child in self.children:
child.tag_strings(taggers)

def mark_structures(self, structures: Optional[Tuple[Dict[int, Structure], ...]] = (), **kwargs):
    """
    recursively annotate strings with the structure they appear to belong to.

    after this runs, a TaggedString whose offset is found in one of the given
    lookup tables has its .structure field set to that structure's name.
    the tables are consulted in order and the first match wins.

    subclasses that know how to parse structures (such as a PE file and all its data)
    can override this to supply additional lookup tables to their children.
    """
    if structures:
        for tagged in self.strings:
            # first non-empty lookup result across the tables, if any
            candidates = (table.get(tagged.offset) for table in structures)
            found = next(filter(None, candidates), None)
            if found is not None:
                tagged.structure = found.name

    for node in self.children:
        node.mark_structures(structures=structures, **kwargs)


@dataclass
class SectionLayout(Layout):
    """layout node that carries the pefile section structure it corresponds to."""

    # the pefile structure describing this section
    section: pefile.SectionStructure


@dataclass
class SegmentLayout(Layout):
    """
    a region that no section covers, for example the PE header or an overlay.

    adds no fields or behavior beyond Layout; the type itself is the marker.
    """


@dataclass
class PELayout(Layout):
Expand All @@ -654,6 +754,8 @@ class PELayout(Layout):
# file offsets of bytes that are recognized as code
code_offsets: Set[int]

structures_by_address: Dict[int, Structure]

def tag_strings(self, taggers: Sequence[Tagger]):
def check_is_reloc_tagger(s: ExtractedString) -> Sequence[Tag]:
return check_is_reloc(self.reloc_offsets, s)
Expand All @@ -668,88 +770,23 @@ def check_is_code_tagger(s: ExtractedString) -> Sequence[Tag]:

super().tag_strings(taggers)


@dataclass
class SectionLayout(Layout):
    """layout node that carries the pefile section structure it corresponds to."""

    # the pefile structure describing this section
    section: pefile.SectionStructure


@dataclass
class SegmentLayout(Layout):
    """
    a region that no section covers, for example the PE header or an overlay.

    adds no fields or behavior beyond Layout; the type itself is the marker.
    """
def mark_structures(self, structures=(), **kwargs):
    """
    propagate structure lookup tables to the children of this PE.

    section and segment children receive the inherited tables plus this PE's
    own structures_by_address; any other child receives only the inherited tables.

    args:
      structures: lookup tables inherited from enclosing layouts.
        None is accepted and treated as "no tables", because Layout.mark_structures
        annotates this parameter as Optional and forwards it unchanged.
      kwargs: passed through to each child's mark_structures.
    """
    # normalize once, so that `None + tuple` can't raise TypeError below.
    inherited = tuple(structures) if structures else ()
    with_pe_structures = inherited + (self.structures_by_address,)

    for child in self.children:
        if isinstance(child, (SectionLayout, SegmentLayout)):
            # expected child of a PE
            child.mark_structures(structures=with_pe_structures, **kwargs)
        else:
            # unexpected child of a PE
            # maybe like a resource or overlay, etc.
            # which is fine - but we don't expect it to know about the PE structures.
            child.mark_structures(structures=inherited, **kwargs)


@dataclass
class ResourceLayout(Layout):
    """layout node for a resource; adds no fields or behavior beyond Layout."""


@dataclass
class Structure:
    """a named file structure together with the region of data it occupies."""

    # region of the underlying data that this structure covers
    slice: Slice

    # short human readable label, such as "section header" or "import table"
    name: str


def compute_file_structures(slice: Slice, pe: pefile.PE) -> Sequence[Structure]:
    """
    collect the known PE structures, so that strings found within them can be annotated.

    currently recognized:
      - section headers
      - import table: the names of imported DLLs and of imported symbols

    args:
      slice: the region holding the PE file; each structure is a sub-slice of it.
      pe: the parsed PE file.

    returns: the structures, section headers first, followed by import table entries.
    """
    structures = []

    def add(offset: int, size: int, name: str):
        # every structure is recorded the same way: a sub-slice plus a label.
        structures.append(Structure(slice=slice.slice(offset, size), name=name))

    for section in sorted(pe.sections, key=lambda s: s.PointerToRawData):
        # offset and size as reported by pefile for the section structure itself
        add(section.get_file_offset(), section.sizeof(), "section header")

    if hasattr(pe, "DIRECTORY_ENTRY_IMPORT"):
        for dll in pe.DIRECTORY_ENTRY_IMPORT:
            try:
                dll_name = dll.dll.decode("ascii")
            except UnicodeDecodeError:
                continue

            try:
                offset = pe.get_offset_from_rva(dll.struct.Name)
            except pefile.PEFormatError:
                # malformed import descriptor: the name RVA cannot be mapped to a file offset.
                # skip just the DLL name, rather than aborting the entire layout;
                # the imported symbols below carry their own offsets.
                pass
            else:
                add(offset, len(dll_name), "import table")

            for entry in dll.imports:
                if entry.name is None or entry.name_offset is None:
                    continue

                try:
                    symbol_name = entry.name.decode("ascii")
                except UnicodeDecodeError:
                    continue

                add(entry.name_offset, len(symbol_name), "import table")

    # TODO: other structures

    return structures


def compute_pe_layout(slice: Slice) -> Layout:
data = slice.data

Expand All @@ -758,12 +795,13 @@ def compute_pe_layout(slice: Slice) -> Layout:
except pefile.PEFormatError as e:
raise ValueError("pefile failed to load workspace") from e

structures = compute_file_structures(slice, pe)
structures = collect_pe_structures(slice, pe)
reloc_offsets = get_reloc_offsets(slice, pe)

structures_by_range = intervaltree.IntervalTree()
for interval in structures:
structures_by_range.addi(interval.slice.range.offset, interval.slice.range.end, interval)
structures_by_address = {}
for structure in structures:
for offset in structure.slice.range:
structures_by_address[offset] = structure

# lancelot only accepts bytes, not mmap
with timing("lancelot: load workspace"):
Expand Down Expand Up @@ -792,6 +830,7 @@ def compute_pe_layout(slice: Slice) -> Layout:
name="pe",
reloc_offsets=reloc_offsets,
code_offsets=code_offsets,
structures_by_address=structures_by_address,
)

for section in pe.sections:
Expand Down Expand Up @@ -1168,7 +1207,7 @@ def main():
taggers = load_databases()
layout.tag_strings(taggers)

# TODO: figure out how to mark structures
layout.mark_structures()

# remove tags from libraries that have too few matches (five, by default).
remove_false_positive_lib_strings(layout)
Expand Down
1 change: 0 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@
extras_require={
"qs": [
"colorama==0.4.6",
"intervaltree==3.1.0",
"python-lancelot==0.8.6",
],
"dev": [
Expand Down

0 comments on commit e2e36ed

Please sign in to comment.