Skip to content

Commit

Permalink
added support for custom regex on field scope (-fs "[regex pattern]") (#571)
Browse files Browse the repository at this point in the history

* added support for custom regex on field scope

* misc update

---------

Co-authored-by: sandeep <[email protected]>
  • Loading branch information
c3l3si4n and ehsandeep authored Sep 11, 2023
1 parent 28ecdc3 commit 52cc6d0
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 8 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ HEADLESS:
SCOPE:
-cs, -crawl-scope string[] in scope url regex to be followed by crawler
-cos, -crawl-out-scope string[] out of scope url regex to be excluded by crawler
-fs, -field-scope string pre-defined scope field (dn,rdn,fqdn) (default "rdn")
-fs, -field-scope string pre-defined scope field (dn,rdn,fqdn) or custom regex (e.g., '(company-staging.io|company.com)') (default "rdn")
-ns, -no-scope disables host based default scope
-do, -display-out-scope display external endpoint from scoped crawling

Expand Down
2 changes: 1 addition & 1 deletion cmd/katana/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ pipelines offering both headless and non-headless crawling.`)
flagSet.CreateGroup("scope", "Scope",
flagSet.StringSliceVarP(&options.Scope, "crawl-scope", "cs", nil, "in scope url regex to be followed by crawler", goflags.FileCommaSeparatedStringSliceOptions),
flagSet.StringSliceVarP(&options.OutOfScope, "crawl-out-scope", "cos", nil, "out of scope url regex to be excluded by crawler", goflags.FileCommaSeparatedStringSliceOptions),
flagSet.StringVarP(&options.FieldScope, "field-scope", "fs", "rdn", "pre-defined scope field (dn,rdn,fqdn)"),
flagSet.StringVarP(&options.FieldScope, "field-scope", "fs", "rdn", "pre-defined scope field (dn,rdn,fqdn) or custom regex (e.g., '(company-staging.io|company.com)')"),
flagSet.BoolVarP(&options.NoScope, "no-scope", "ns", false, "disables host based default scope"),
flagSet.BoolVarP(&options.DisplayOutScope, "display-out-scope", "do", false, "display external endpoint from scoped crawling"),
)
Expand Down
24 changes: 18 additions & 6 deletions pkg/utils/scope/scope.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,11 @@ import (

// Manager manages scope for crawling process
type Manager struct {
inScope []*regexp.Regexp
outOfScope []*regexp.Regexp
noScope bool
fieldScope dnsScopeField
inScope []*regexp.Regexp
outOfScope []*regexp.Regexp
noScope bool
fieldScope dnsScopeField
fieldScopePattern *regexp.Regexp
}

type dnsScopeField int
Expand All @@ -24,6 +25,7 @@ const (
dnDnsScopeField dnsScopeField = iota + 1
rdnDnsScopeField
fqdnDNSScopeField
customDNSScopeField
)

var stringToDNSScopeField = map[string]dnsScopeField{
Expand All @@ -39,7 +41,12 @@ func NewManager(inScope, outOfScope []string, fieldScope string, noScope bool) (
}

if scopeValue, ok := stringToDNSScopeField[fieldScope]; !ok {
return nil, fmt.Errorf("invalid dns scope field specified: %s", fieldScope)
manager.fieldScope = customDNSScopeField
if compiled, err := regexp.Compile(fieldScope); err != nil {
return nil, fmt.Errorf("could not compile regex %s: %s", fieldScope, err)
} else {
manager.fieldScopePattern = compiled
}
} else {
manager.fieldScope = scopeValue
}
Expand Down Expand Up @@ -108,7 +115,12 @@ func (m *Manager) validateURL(URL string) (bool, error) {

func (m *Manager) validateDNS(hostname, rootHostname string) (bool, error) {
parsed := net.ParseIP(hostname)

if m.fieldScope == customDNSScopeField {
// If we have a custom regex, we need to match it against the full hostname
if m.fieldScopePattern.MatchString(hostname) {
return true, nil
}
}
if m.fieldScope == fqdnDNSScopeField || parsed != nil {
matched := strings.EqualFold(hostname, rootHostname)
return matched, nil
Expand Down

0 comments on commit 52cc6d0

Please sign in to comment.