Skip to content

Commit

Permalink
Implement Regex Needle Optimization
Browse files Browse the repository at this point in the history
  • Loading branch information
stevemk14ebr committed Aug 14, 2023
1 parent ebc220a commit 11d153d
Show file tree
Hide file tree
Showing 5 changed files with 127 additions and 192 deletions.
3 changes: 2 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
module github.com/mandiant/GoReSym

go 1.20
go 1.21

require (
github.com/elliotchance/orderedmap v1.4.0
Expand All @@ -12,4 +12,5 @@ require (
require (
github.com/felixge/fgprof v0.9.3 // indirect
github.com/google/pprof v0.0.0-20230728192033-2ba5b33183c6 // indirect
golang.org/x/exp v0.0.0-20230811145659-89c5cff77bcb
)
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PK
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
golang.org/x/arch v0.0.0-20201008161808-52c3e6f60cff h1:XmKBi9R6duxOB3lfc72wyrwiOY7X2Jl1wuI+RFOyMDE=
golang.org/x/arch v0.0.0-20201008161808-52c3e6f60cff/go.mod h1:flIaEI6LNU6xOCD5PaJvn9wGP0agmIOqjrtsKGRguv4=
golang.org/x/exp v0.0.0-20230811145659-89c5cff77bcb h1:mIKbk8weKhSeLH2GmUTrvx8CjkyJmnU1wFmg59CUjFA=
golang.org/x/exp v0.0.0-20230811145659-89c5cff77bcb/go.mod h1:FXUEEKJgO7OQYeo8N01OfiKP8RXMtf6e8aTskBGqWdc=
golang.org/x/sys v0.0.0-20211007075335-d3039528d8ac/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
Expand Down
176 changes: 70 additions & 106 deletions objfile/patterns.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"strconv"
"strings"

"golang.org/x/exp/slices"
"rsc.io/binaryregexp"
)

Expand Down Expand Up @@ -42,14 +43,14 @@ func isHex(s string) bool {
// although this requires more code, we provide this functionality
// because these patterns are *much* more readable than raw regular expressions;
// we strongly value people being able to understand GoReSym's algorithm.
func RegexpPatternFromYaraPattern(pattern string) (string, error) {
func RegexpPatternFromYaraPattern(pattern string) (*RegexAndNeedle, error) {

if !strings.HasPrefix(pattern, "{") {
return "", errors.New("missing prefix")
return nil, errors.New("missing prefix")
}

if !strings.HasSuffix(pattern, "}") {
return "", errors.New("missing suffix")
return nil, errors.New("missing suffix")
}

pattern = strings.Trim(pattern, "{}")
Expand All @@ -58,6 +59,10 @@ func RegexpPatternFromYaraPattern(pattern string) (string, error) {

pattern = strings.ToLower(pattern)

patLen := 0
needle := make([]byte, 0)
tmpNeedle := make([]byte, 0)

var regex_pattern string
for i := 0; i < len(pattern); {
// at the start of this loop,
Expand All @@ -71,12 +76,19 @@ func RegexpPatternFromYaraPattern(pattern string) (string, error) {
// output: .
if c == "?" {
if d != "?" {
return "", errors.New("cannot mask the first nibble")
return nil, errors.New("cannot mask the first nibble")
}

regex_pattern += "."

i += 2
patLen += 1
if len(tmpNeedle) > len(needle) {
needle = slices.Clone(tmpNeedle)
tmpNeedle = make([]byte, 0)
} else {
tmpNeedle = make([]byte, 0)
}
continue
}

Expand All @@ -85,23 +97,23 @@ func RegexpPatternFromYaraPattern(pattern string) (string, error) {
if c == "[" {
end := strings.Index(pattern[i:], "]")
if end == -1 {
return "", errors.New("unbalanced [")
return nil, errors.New("unbalanced [")
}

chunk := pattern[i+1 : i+end]
low, high, found := strings.Cut(chunk, "-")
if !found {
return "", errors.New("[] didn't contain a dash")
return nil, errors.New("[] didn't contain a dash")
}

_, err := strconv.Atoi(low)
if err != nil {
return "", errors.New("invalid number")
return nil, errors.New("invalid number")
}

_, err = strconv.Atoi(high)
if err != nil {
return "", errors.New("invalid number")
return nil, errors.New("invalid number")
}

regex_pattern += "."
Expand All @@ -112,6 +124,14 @@ func RegexpPatternFromYaraPattern(pattern string) (string, error) {
regex_pattern += "}"

i += end + 1
patLen += 1

if len(tmpNeedle) > len(needle) {
needle = slices.Clone(tmpNeedle)
tmpNeedle = make([]byte, 0)
} else {
tmpNeedle = make([]byte, 0)
}
continue
}

Expand All @@ -120,7 +140,7 @@ func RegexpPatternFromYaraPattern(pattern string) (string, error) {
if c == "(" {
end := strings.Index(pattern[i:], ")")
if end == -1 {
return "", errors.New("unbalanced (")
return nil, errors.New("unbalanced (")
}

chunk := pattern[i+1 : i+end]
Expand All @@ -129,7 +149,7 @@ func RegexpPatternFromYaraPattern(pattern string) (string, error) {
regex_pattern += "("
for j, choice := range choices {
if !isHex(choice) {
return "", errors.New("choice not hex")
return nil, errors.New("choice not hex")
}

if j != 0 {
Expand All @@ -141,14 +161,21 @@ func RegexpPatternFromYaraPattern(pattern string) (string, error) {
regex_pattern += ")"

i += end + 1
patLen += len(choices)
if len(tmpNeedle) > len(needle) {
needle = slices.Clone(tmpNeedle)
tmpNeedle = make([]byte, 0)
} else {
tmpNeedle = make([]byte, 0)
}
continue
}

// input: 0?
// output: [\x00-\x0F]
if d == "?" {
if !isHex(c) {
return "", errors.New("not hex digit")
return nil, errors.New("not hex digit")
}

regex_pattern += "["
Expand All @@ -158,124 +185,61 @@ func RegexpPatternFromYaraPattern(pattern string) (string, error) {
regex_pattern += "]"

i += 2
patLen += 1
if len(tmpNeedle) > len(needle) {
needle = slices.Clone(tmpNeedle)
tmpNeedle = make([]byte, 0)
} else {
tmpNeedle = make([]byte, 0)
}
continue
}

// input: AB
// output: \xAB
if isHex(c) && isHex(d) {
regex_pattern += `\x` + strings.ToUpper(c+d)

byt, err := strconv.ParseInt(c+d, 16, 64)
if err != nil {
return nil, errors.New("not hex digit")
}
tmpNeedle = append(tmpNeedle, byte(byt))
i += 2
patLen += 1
continue
}

return "", errors.New("unexpected value")
return nil, errors.New("unexpected value")
}

return regex_pattern, nil
}

func RegexpFromYaraPattern(pattern string) (*binaryregexp.Regexp, error) {
regex_pattern, e := RegexpPatternFromYaraPattern(pattern)
if e != nil {
return nil, e
if len(tmpNeedle) > len(needle) {
needle = slices.Clone(tmpNeedle)
//tmpNeedle = make([]byte, 0) not needed at exit
}

r := binaryregexp.MustCompile(regex_pattern)
if r == nil {
return nil, errors.New("failed to compile regex")
}

return r, nil
}

type BinaryRegexpGroup struct {
patterns map[string]string

re *binaryregexp.Regexp
}

func NewBinaryRegexpGroup(patterns map[string]string) (*BinaryRegexpGroup, error) {

var pattern string

i := 0
pattern += "("
for k, v := range patterns {
if i != 0 {
pattern += "|"
}
i += 1

pattern += "(?P"
pattern += "<" + k + ">"
pattern += v
pattern += ")"
}
pattern += ")"

re := binaryregexp.MustCompile(pattern)
if re == nil {
return nil, errors.New("failed to compile regex")
}

return &BinaryRegexpGroup{
patterns: patterns,
re: re,
}, nil
}

type BinaryRegexGroupMatches struct {
g *BinaryRegexpGroup
matches [][]int
}

func (g *BinaryRegexpGroup) FindAllIndex(buf []byte, n int) *BinaryRegexGroupMatches {
matches := g.re.FindAllIndex(buf, n)

return &BinaryRegexGroupMatches{
g: g,
matches: matches,
}
}

// fetch the index of the subexp for the given regexp.
//
// this is called `(*Regexp) SubexpIndex` in recent Go,
// but doesn't seem to be implemented in binaryregexp.
// https://pkg.go.dev/regexp#Regexp.SubexpIndex
func SubexpIndex(re *binaryregexp.Regexp, name string) int {
for i, n := range re.SubexpNames() {
if n == name {
return i
}
}

return -1
return &RegexAndNeedle{patLen, regex_pattern, r, needle}, nil
}

// fetch the [start, end] pairs for the subexp with the given name in the given matches.
func SubexpIndexMatches(re *binaryregexp.Regexp, matches [][]int, name string) [][]int {
index := SubexpIndex(re, name)

var ret [][]int
for _, match := range matches {

start := match[2*index]
end := match[2*index+1]

if start == -1 && end == -1 {
continue
// FindRegex returns the start offset, relative to data, of every match of
// regexInfo.re that lies in the neighbourhood of an occurrence of
// regexInfo.needle.
//
// Instead of running the regex over all of data, data is first searched for
// the needle via findAllOccurrences (presumably yielding the offset of each
// hit -- verify against its definition), and the regex is then applied only
// to a window reaching regexInfo.len bytes to either side of each hit.
//
// Windows of nearby needle hits may overlap, so the same offset can appear
// in the result more than once. The result is never nil.
func FindRegex(data []byte, regexInfo *RegexAndNeedle) []int {
	matches := make([]int, 0)
	needleMatches := findAllOccurrences(data, [][]byte{regexInfo.needle})
	for _, needleMatch := range needleMatches {
		// Clamp the window to the bounds of data: a needle hit close to the
		// start or end of the buffer would otherwise produce an out-of-range
		// slice expression and panic.
		windowStart := needleMatch - regexInfo.len
		if windowStart < 0 {
			windowStart = 0
		}
		windowEnd := needleMatch + regexInfo.len
		if windowEnd > len(data) {
			windowEnd = len(data)
		}
		if windowStart > windowEnd {
			continue
		}

		for _, reMatch := range regexInfo.re.FindAllIndex(data[windowStart:windowEnd], -1) {
			// reMatch indexes into the window, not into data; translate the
			// start back so callers get an offset they can use on data.
			matches = append(matches, windowStart+reMatch[0])
		}
	}

	return matches
}

// fetch the [start, end] pairs for the subexp with the given name.
func (m *BinaryRegexGroupMatches) MatchesForSubexp(name string) [][]int {
return SubexpIndexMatches(m.g.re, m.matches, name)
// RegexAndNeedle bundles a regular expression translated from a YARA-style
// hex pattern with the data FindRegex needs to narrow its search before
// running the regex.
//
// RegexpPatternFromYaraPattern builds values of this type with a positional
// composite literal, so the field order below must not change.
type RegexAndNeedle struct {
// len is the element count accumulated while translating the pattern;
// FindRegex uses it as the number of bytes to scan on each side of a
// needle hit.
// NOTE(review): a [n-m] jump adds 1 and an alternation adds the number of
// choices, so this only approximates the matched length -- confirm it is
// large enough for patterns using those constructs.
len int
// rawre is the regular expression source text generated from the pattern.
rawre string
// re is the compiled form of rawre.
re *binaryregexp.Regexp
// needle is the longest run of consecutive fully-specified bytes in the
// pattern (the first such run wins on ties); it is empty when the pattern
// contains no literal bytes.
needle []byte
}
Loading

0 comments on commit 11d153d

Please sign in to comment.