Skip to content

Commit

Permalink
Optimize and omit duplicate pattern matches (#66)
Browse files Browse the repository at this point in the history
  • Loading branch information
ViRb3 authored Oct 19, 2024
1 parent 54a6712 commit 9eb82e5
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 20 deletions.
70 changes: 55 additions & 15 deletions objfile/patterns.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package objfile

import (
"errors"
"sort"
"strconv"
"strings"

Expand Down Expand Up @@ -238,11 +239,48 @@ func RegexpPatternFromYaraPattern(pattern string) (*RegexAndNeedle, error) {
return &RegexAndNeedle{patLen, regex_pattern, r, needleOffset, needle}, nil
}

func FindRegex(data []byte, regexInfo *RegexAndNeedle) []int {
// getOrSetRegion reports whether the (start, end) pair has already been
// recorded in regionMap. A pair that was not yet recorded is added as a
// side effect, so a second call with the same arguments returns true.
//
// regionMap is keyed by start offset; each inner map is the set of end
// offsets seen for that start. regionMap must be non-nil because it is
// written to.
func getOrSetRegion(regionMap map[int]map[int]bool, start, end int) bool {
	seenEnds, known := regionMap[start]
	if !known {
		// first region beginning at this start offset
		regionMap[start] = map[int]bool{end: true}
		return false
	}

	// Read the previous state before marking: marking an already-recorded
	// pair again is a no-op, so a single write covers both cases.
	alreadySeen := seenEnds[end]
	seenEnds[end] = true
	return alreadySeen
}

// regionMapToSlices flattens a start -> set-of-ends map into a list of
// [start, end] pairs ordered by start and then by end.
//
// Every end key present in an inner map is emitted, regardless of the bool
// stored for it. The result is always non-nil, even for an empty or nil
// input.
func regionMapToSlices(regionMap map[int]map[int]bool) [][]int {
	// count pairs up front so the output is allocated exactly once
	pairCount := 0
	for _, ends := range regionMap {
		pairCount += len(ends)
	}

	pairs := make([][]int, 0, pairCount)
	for start, ends := range regionMap {
		for end := range ends {
			pairs = append(pairs, []int{start, end})
		}
	}

	// Map iteration order is random; a lexicographic sort makes the output
	// deterministic. Pairs are unique, so sort stability does not matter.
	sort.Slice(pairs, func(i, j int) bool {
		if pairs[i][0] != pairs[j][0] {
			return pairs[i][0] < pairs[j][0]
		}
		return pairs[i][1] < pairs[j][1]
	})
	return pairs
}

func FindRegex(data []byte, regexInfo *RegexAndNeedle) [][]int {
data_len := len(data)
matches := make([]int, 0)
matchMap := make(map[int]map[int]bool)
cacheMap := make(map[int]map[int]bool)

// use an optimized memscan to find some candidates chunks from the much larger haystack
// use an optimized memscan to find all candidates chunks from the much larger haystack
needleMatches := findAllOccurrences(data, [][]byte{regexInfo.needle})
for _, needleMatch := range needleMatches {
// adjust the window to the pattern start and end
Expand All @@ -258,35 +296,37 @@ func FindRegex(data []byte, regexInfo *RegexAndNeedle) []int {
data_end = data_len
}

// don't repeat previously scanned chunks
if getOrSetRegion(cacheMap, data_start, data_end) {
continue
}
// do the full regex scan on a very small chunk
for _, reMatch := range regexInfo.re.FindAllIndex(data[data_start:data_end], -1) {
// the match offset is the start index of the chunk + reMatch index
start := reMatch[0] + data_start
end := reMatch[1] + data_start
getOrSetRegion(matchMap, start, end)

//end := reMatch[1] + data_start
matches = append(matches, start)

// special case to handle sub-matches, which are skipped by regex but matched by YARA:
// AA AA BB CC
// { AA [0-1] BB CC }
// must produce:
// AA AA BB CC
// AA BB CC
// handle sub-matches, which are skipped by regex but matched by YARA
subStart := start + 1
for {
// don't repeat previously scanned chunks
if getOrSetRegion(cacheMap, subStart, data_end) {
break
}
subMatches := regexInfo.re.FindAllIndex(data[subStart:data_end], -1)
if len(subMatches) == 0 {
break
}

for _, match := range subMatches {
matches = append(matches, match[0]+subStart)
getOrSetRegion(matchMap, match[0]+subStart, match[1]+subStart)
}
subStart += subMatches[0][0] + 1
}
}
}
return matches

return regionMapToSlices(matchMap)
}

type RegexAndNeedle struct {
Expand Down
26 changes: 26 additions & 0 deletions objfile/patterns_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package objfile

import (
"bytes"
"reflect"
"testing"

"rsc.io/binaryregexp"
Expand Down Expand Up @@ -265,4 +266,29 @@ func TestRegexpPatternFromYaraPattern(t *testing.T) {
t.Errorf("incorrect needle")
}
})

t.Run("Repeat", func(t *testing.T) {
reg, err := RegexpPatternFromYaraPattern("{ AA [0-512] BB }")

if err != nil {
t.Errorf("pattern errored")
}

if reg.len != 514 {
t.Errorf("incorrect pattern length")
}

if reg.needleOffset != 0 {
t.Errorf("incorrect needle offset")
}

if !bytes.Equal(reg.needle, []byte{0xAA}) {
t.Errorf("incorrect needle")
}

results := FindRegex([]byte{0xAA, 0xAA, 0xAA, 0xBB, 0xAA, 0xAA, 0xBB, 0xAA, 0xBB, 0xCC}, reg)
if !reflect.DeepEqual(results, [][]int{{0, 4}, {1, 4}, {2, 4}, {4, 7}, {5, 7}, {7, 9}}) {
t.Errorf("incorrect match indexes")
}
})
}
10 changes: 5 additions & 5 deletions objfile/scanner.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ func findModuleInitPCHeader(data []byte, sectionBase uint64) []SignatureMatch {
}

for _, match := range FindRegex(data, x64reg) {
sigPtr := uint64(match) // from int
sigPtr := uint64(match[0]) // from int

// this is the pointer offset stored in the instruction
// 0x44E06A: 48 8D 0D 4F F0 24 00 lea rcx, off_69D0C0 (result: 0x24f04f)
Expand All @@ -119,7 +119,7 @@ func findModuleInitPCHeader(data []byte, sectionBase uint64) []SignatureMatch {
}

for _, match := range FindRegex(data, x86reg) {
sigPtr := uint64(match) // from int
sigPtr := uint64(match[0]) // from int

moduleDataPtr := uint64(binary.LittleEndian.Uint32(data[sigPtr+x86sig.moduleDataPtrLoc:][:4]))
matches = append(matches, SignatureMatch{
Expand All @@ -138,7 +138,7 @@ func findModuleInitPCHeader(data []byte, sectionBase uint64) []SignatureMatch {
}

for _, match := range FindRegex(data, arm64reg) {
sigPtr := uint64(match) // from int
sigPtr := uint64(match[0]) // from int

adrp := binary.LittleEndian.Uint32(data[sigPtr+ARM64_sig.moduleDataPtrADRP:][:4])
add := binary.LittleEndian.Uint32(data[sigPtr+ARM64_sig.moduleDataPtrADD:][:4])
Expand Down Expand Up @@ -169,7 +169,7 @@ func findModuleInitPCHeader(data []byte, sectionBase uint64) []SignatureMatch {
}

for _, match := range FindRegex(data, arm32reg) {
sigPtr := uint64(match) // from int
sigPtr := uint64(match[0]) // from int
ldr := binary.LittleEndian.Uint32(data[sigPtr+ARM32_sig.moduleDataPtrLDR:][:4])
// ARM PC relative is always +8 due to legacy nonsense
ldr_pointer_stub := uint64((ldr & 0x00000FFF) + 8)
Expand All @@ -190,7 +190,7 @@ func findModuleInitPCHeader(data []byte, sectionBase uint64) []SignatureMatch {
}

for _, match := range FindRegex(data, ppcBEreg) {
sigPtr := uint64(match) // from int
sigPtr := uint64(match[0]) // from int
moduleDataPtrHi := int64(binary.BigEndian.Uint16(data[sigPtr+PPC_BE_sig.moduleDataPtrHi:][:2]))
// addi takes a signed immediate
moduleDataPtrLo := int64(int16(binary.BigEndian.Uint16(data[sigPtr+PPC_BE_sig.moduleDataPtrLo:][:2])))
Expand Down

0 comments on commit 9eb82e5

Please sign in to comment.