From 4c266e0a6f235f8dc3a1c86924606bb4e67beb2d Mon Sep 17 00:00:00 2001 From: "Alex Munene (@enenumxela)" <62714471+enenumxela@users.noreply.github.com> Date: Mon, 26 Jun 2023 12:10:07 +0300 Subject: [PATCH 01/24] docs: - --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4186b08..2ba3732 100644 --- a/README.md +++ b/README.md @@ -26,9 +26,9 @@ * **[AlienVault's OTX](https://otx.alienvault.com/)** * **[BeVigil](https://bevigil.com)** * **[Common Crawl](https://commoncrawl.org/)** - * **[URLScan](https://urlscan.io/)** * **[Github](https://github.com)** * **[Intelligence X](https://intelx.io)** + * **[URLScan](https://urlscan.io/)** * **[Wayback Machine](https://archive.org/web/)** * With Wayback Machine, Parses URLs from `robots.txt` snapshots. * With Wayback Machine, Parses URLs from webpages' snapshots. From 22e0be05f15eaa9e9133d1b8e4519fe6e64b3599 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 26 Jun 2023 15:04:07 +0000 Subject: [PATCH 02/24] chore(deps): bump github.com/valyala/fasthttp from 1.47.0 to 1.48.0 Bumps [github.com/valyala/fasthttp](https://github.com/valyala/fasthttp) from 1.47.0 to 1.48.0. - [Release notes](https://github.com/valyala/fasthttp/releases) - [Commits](https://github.com/valyala/fasthttp/compare/v1.47.0...v1.48.0) --- updated-dependencies: - dependency-name: github.com/valyala/fasthttp dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index dab40a9..28bba72 100644 --- a/go.mod +++ b/go.mod @@ -11,7 +11,7 @@ require ( github.com/logrusorgru/aurora/v3 v3.0.0 github.com/spf13/pflag v1.0.5 github.com/tomnomnom/linkheader v0.0.0-20180905144013-02ca5825eb80 - github.com/valyala/fasthttp v1.47.0 + github.com/valyala/fasthttp v1.48.0 gopkg.in/yaml.v3 v3.0.1 ) diff --git a/go.sum b/go.sum index 92cf66d..9f3807f 100644 --- a/go.sum +++ b/go.sum @@ -23,8 +23,8 @@ github.com/tomnomnom/linkheader v0.0.0-20180905144013-02ca5825eb80 h1:nrZ3ySNYwJ github.com/tomnomnom/linkheader v0.0.0-20180905144013-02ca5825eb80/go.mod h1:iFyPdL66DjUD96XmzVL3ZntbzcflLnznH0fr99w5VqE= github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw= github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc= -github.com/valyala/fasthttp v1.47.0 h1:y7moDoxYzMooFpT5aHgNgVOQDrS3qlkfiP9mDtGGK9c= -github.com/valyala/fasthttp v1.47.0/go.mod h1:k2zXd82h/7UZc3VOdJ2WaUqt1uZ/XpXAfE9i+HBC3lA= +github.com/valyala/fasthttp v1.48.0 h1:oJWvHb9BIZToTQS3MuQ2R3bJZiNSa2KiNdeI8A+79Tc= +github.com/valyala/fasthttp v1.48.0/go.mod h1:k2zXd82h/7UZc3VOdJ2WaUqt1uZ/XpXAfE9i+HBC3lA= golang.org/x/net v0.11.0 h1:Gi2tvZIJyBtO9SDr1q9h5hEQCp/4L2RQ+ar0qjx2oNU= golang.org/x/net v0.11.0/go.mod h1:2L/ixqYpgIVXmeoSA/4Lu7BzTG4KIyPIryS4IsOd1oQ= golang.org/x/sys v0.9.0 h1:KS/R3tvhPqvJvwcKfnBHJwwthS11LRhmM5D59eEXa0s= From 6ef333890bc0622247fe7eb73e17a330a0aee231 Mon Sep 17 00:00:00 2001 From: "Alex Munene (@enenumxela)" <62714471+enenumxela@users.noreply.github.com> Date: Wed, 28 Jun 2023 16:01:08 +0300 Subject: [PATCH 03/24] chore: Misc cleanup --- pkg/xurlfind3r/sources/bevigil/bevigil.go | 28 ++++----- .../sources/commoncrawl/commoncrawl.go | 57 +++++++++--------- pkg/xurlfind3r/sources/github/github.go | 18 +++--- pkg/xurlfind3r/sources/intelx/intelx.go | 28 ++++----- 
pkg/xurlfind3r/sources/otx/otx.go | 44 +++++++------- pkg/xurlfind3r/sources/urlscan/urlscan.go | 58 +++++++++++-------- pkg/xurlfind3r/sources/utils.go | 4 +- 7 files changed, 131 insertions(+), 106 deletions(-) diff --git a/pkg/xurlfind3r/sources/bevigil/bevigil.go b/pkg/xurlfind3r/sources/bevigil/bevigil.go index c60ccaa..3ee1437 100644 --- a/pkg/xurlfind3r/sources/bevigil/bevigil.go +++ b/pkg/xurlfind3r/sources/bevigil/bevigil.go @@ -9,7 +9,7 @@ import ( "github.com/valyala/fasthttp" ) -type response struct { +type Response struct { Domain string `json:"domain"` URLs []string `json:"urls"` } @@ -23,10 +23,8 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc defer close(URLsChannel) var ( - key string - err error - res *fasthttp.Response - headers = map[string]string{} + err error + key string ) key, err = sources.PickRandom(config.Keys.Bevigil) @@ -34,27 +32,31 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc return } + reqHeaders := map[string]string{} + if len(config.Keys.Bevigil) > 0 { - headers["X-Access-Token"] = key + reqHeaders["X-Access-Token"] = key } reqURL := fmt.Sprintf("https://osint.bevigil.com/api/%s/urls/", config.Domain) - res, err = httpclient.Request(fasthttp.MethodGet, reqURL, "", headers, nil) + var res *fasthttp.Response + + res, err = httpclient.Request(fasthttp.MethodGet, reqURL, "", reqHeaders, nil) if err != nil { return } - body := res.Body() + var data Response - var results response - - if err = json.Unmarshal(body, &results); err != nil { + if err = json.Unmarshal(res.Body(), &data); err != nil { return } - for _, i := range results.URLs { - URLsChannel <- sources.URL{Source: source.Name(), Value: i} + for index := range data.URLs { + URL := data.URLs[index] + + URLsChannel <- sources.URL{Source: source.Name(), Value: URL} } }() diff --git a/pkg/xurlfind3r/sources/commoncrawl/commoncrawl.go b/pkg/xurlfind3r/sources/commoncrawl/commoncrawl.go index f6acc56..edd857f 100644 --- a/pkg/xurlfind3r/sources/commoncrawl/commoncrawl.go +++ b/pkg/xurlfind3r/sources/commoncrawl/commoncrawl.go @@ -13,17 +13,17 @@ import ( "github.com/valyala/fasthttp" ) -type Source struct{} +type API struct { + ID string `json:"id"` + API string `json:"cdx-API"` +} -type CDXAPIResult struct { +type Response struct { URL string `json:"url"` Error string `json:"error"` } -type Index struct { - ID string `json:"id"` - CDX_API string `json:"cdx-API"` //nolint:revive,stylecheck // Is as is -} +type Source struct{} func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sources.URL) { URLsChannel = make(chan sources.URL) @@ -33,37 +33,42 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc var ( err error - res *fasthttp.Response ) + var res *fasthttp.Response + res, err = httpclient.SimpleGet("https://index.commoncrawl.org/collinfo.json") if err != nil { return } - var commonCrawlIndexes []Index + var APIs []API - if err = json.Unmarshal(res.Body(), &commonCrawlIndexes); err != nil { + if err = json.Unmarshal(res.Body(), &APIs); err != nil { return } wg := new(sync.WaitGroup) - for index := range commonCrawlIndexes { + for index := range APIs { wg.Add(1) - commonCrawlIndex := commonCrawlIndexes[index] + API := APIs[index] go func(API string) { defer wg.Done() var ( - err error - headers = map[string]string{"Host": "index.commoncrawl.org"} - res *fasthttp.Response + err error + // headers = map[string]string{"Host": "index.commoncrawl.org"} + // res *fasthttp.Response ) - 
res, err = httpclient.Get(fmt.Sprintf("%s?url=*.%s/*&output=json&fl=url", API, config.Domain), "", headers) + reqHeaders := map[string]string{"Host": "index.commoncrawl.org"} + + var res *fasthttp.Response + + res, err = httpclient.Get(fmt.Sprintf("%s?url=*.%s/*&output=json&fl=url", API, config.Domain), "", reqHeaders) if err != nil { return } @@ -71,25 +76,25 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc scanner := bufio.NewScanner(bytes.NewReader(res.Body())) for scanner.Scan() { - var result CDXAPIResult + var data Response - if err = json.Unmarshal(scanner.Bytes(), &result); err != nil { + if err = json.Unmarshal(scanner.Bytes(), &data); err != nil { return } - if result.Error != "" { + if data.Error != "" { return } - URL := result.URL + URL := data.URL - if !sources.IsValid(URL) { - return - } + // if !sources.IsValid(URL) { + // return + // } - if !sources.IsInScope(URL, config.Domain, config.IncludeSubdomains) { - return - } + // if !sources.IsInScope(URL, config.Domain, config.IncludeSubdomains) { + // return + // } URLsChannel <- sources.URL{Source: source.Name(), Value: URL} } @@ -97,7 +102,7 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc if scanner.Err() != nil { return } - }(commonCrawlIndex.CDX_API) + }(API.API) } wg.Wait() diff --git a/pkg/xurlfind3r/sources/github/github.go b/pkg/xurlfind3r/sources/github/github.go index 4e05c8c..6fc6fcb 100644 --- a/pkg/xurlfind3r/sources/github/github.go +++ b/pkg/xurlfind3r/sources/github/github.go @@ -137,13 +137,13 @@ func proccesItems(items []item, domainRegexp *regexp.Regexp, name string, URLsCh } for _, URL = range domainRegexp.FindAllString(normalizeContent(line), -1) { - if !sources.IsValid(URL) { - continue - } + // if !sources.IsValid(URL) { + // continue + // } - if !sources.IsInScope(URL, config.Domain, config.IncludeSubdomains) { - return - } + // if !sources.IsInScope(URL, config.Domain, config.IncludeSubdomains) { + // return + // } URLsChannel <- sources.URL{Source: name, Value: URL} } @@ -152,9 +152,9 @@ func proccesItems(items []item, domainRegexp *regexp.Regexp, name string, URLsCh for _, textMatch := range item.TextMatches { for _, URL = range domainRegexp.FindAllString(normalizeContent(textMatch.Fragment), -1) { - if !sources.IsValid(URL) { - continue - } + // if !sources.IsValid(URL) { + // continue + // } URLsChannel <- sources.URL{Source: name, Value: URL} } diff --git a/pkg/xurlfind3r/sources/intelx/intelx.go b/pkg/xurlfind3r/sources/intelx/intelx.go index fddccda..9e3ec87 100644 --- a/pkg/xurlfind3r/sources/intelx/intelx.go +++ b/pkg/xurlfind3r/sources/intelx/intelx.go @@ -12,7 +12,7 @@ import ( "github.com/valyala/fasthttp" ) -type searchResponseType struct { +type SearchResponse struct { ID string `json:"id"` Status int `json:"status"` } @@ -42,10 +42,12 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc defer close(URLsChannel) var ( - key string - err error - res *fasthttp.Response + err error + key string + body []byte + + res *fasthttp.Response ) key, err = sources.PickRandom(config.Keys.Intelx) @@ -62,14 +64,14 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc } searchURL := fmt.Sprintf("https://%s/phonebook/search?k=%s", intelXHost, intelXKey) - reqBody := requestBody{ + searchReqBody := requestBody{ Term: config.Domain, MaxResults: 100000, Media: 0, Timeout: 20, } - body, err = json.Marshal(reqBody) + body, err = json.Marshal(searchReqBody) if err != nil { return } 
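
The source refactors in this patch all converge on one shape: Run(config) returns immediately, streams sources.URL values tagged with the producing source's name from a goroutine, and closes the channel when done. Below is a minimal, self-contained sketch of that producer/consumer pattern — the URL type, run function, and example values are illustrative stand-ins, not the repository's own code.

    package main

    import "fmt"

    // URL mirrors the shape of sources.URL: a result value tagged with the
    // name of the source that produced it.
    type URL struct {
    	Source string
    	Value  string
    }

    // run mimics a source's Run method: it returns a channel immediately and
    // streams results from a goroutine, closing the channel when finished.
    func run(domain string) chan URL {
    	results := make(chan URL)

    	go func() {
    		defer close(results)

    		// A real source would query its API here and emit every match.
    		results <- URL{Source: "example", Value: "https://" + domain + "/login"}
    	}()

    	return results
    }

    func main() {
    	// The caller simply drains the channel; range ends once run closes it.
    	for result := range run("example.com") {
    		fmt.Printf("[%s] %s\n", result.Source, result.Value)
    	}
    }
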
@@ -79,7 +81,7 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc return } - var response searchResponseType + var response SearchResponse if err = json.Unmarshal(res.Body(), &response); err != nil { return @@ -105,13 +107,13 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc for _, hostname := range response.Selectors { URL := hostname.Selectvalue - if !sources.IsValid(URL) { - continue - } + // if !sources.IsValid(URL) { + // continue + // } - if !sources.IsInScope(URL, config.Domain, config.IncludeSubdomains) { - return - } + // if !sources.IsInScope(URL, config.Domain, config.IncludeSubdomains) { + // return + // } URLsChannel <- sources.URL{Source: source.Name(), Value: URL} } diff --git a/pkg/xurlfind3r/sources/otx/otx.go b/pkg/xurlfind3r/sources/otx/otx.go index e5a992b..2a6c909 100644 --- a/pkg/xurlfind3r/sources/otx/otx.go +++ b/pkg/xurlfind3r/sources/otx/otx.go @@ -10,9 +10,7 @@ import ( "github.com/valyala/fasthttp" ) -type Source struct{} - -type response struct { +type Response struct { URLList []struct { Domain string `json:"domain"` URL string `json:"url"` @@ -29,44 +27,50 @@ type response struct { ActualSize int `json:"actual_size"` } +type Source struct{} + func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sources.URL) { URLsChannel = make(chan sources.URL) go func() { defer close(URLsChannel) - var ( - err error - res *fasthttp.Response - ) - for page := 1; ; page++ { - res, err = httpclient.SimpleGet(fmt.Sprintf("https://otx.alienvault.com/api/v1/indicators/domain/%s/url_list?limit=%d&page=%d", config.Domain, 200, page)) + var ( + err error + ) + + reqURL := fmt.Sprintf("https://otx.alienvault.com/api/v1/indicators/domain/%s/url_list?limit=%d&page=%d", config.Domain, 200, page) + + var res *fasthttp.Response + + res, err = httpclient.SimpleGet(reqURL) if err != nil { return } - var results response + var data Response - if err = json.Unmarshal(res.Body(), &results); err != nil { + if err = json.Unmarshal(res.Body(), &data); err != nil { return } - for _, i := range results.URLList { - URL := i.URL + for index := range data.URLList { + URL := data.URLList[index].URL + // URL := i.URL - if !sources.IsValid(URL) { - continue - } + // if !sources.IsValid(URL) { + // continue + // } - if !sources.IsInScope(URL, config.Domain, config.IncludeSubdomains) { - return - } + // if !sources.IsInScope(URL, config.Domain, config.IncludeSubdomains) { + // return + // } URLsChannel <- sources.URL{Source: source.Name(), Value: URL} } - if !results.HasNext { + if !data.HasNext { break } } diff --git a/pkg/xurlfind3r/sources/urlscan/urlscan.go b/pkg/xurlfind3r/sources/urlscan/urlscan.go index 8c4f926..936409f 100644 --- a/pkg/xurlfind3r/sources/urlscan/urlscan.go +++ b/pkg/xurlfind3r/sources/urlscan/urlscan.go @@ -4,13 +4,14 @@ package urlscan import ( "encoding/json" "net/url" + "strings" "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/httpclient" "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/sources" "github.com/valyala/fasthttp" ) -type response struct { +type Response struct { Results []struct { Page struct { Domain string `json:"domain"` @@ -35,13 +36,13 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc defer close(URLsChannel) var ( - key string - err error - res *fasthttp.Response + err error + key string + searchAfter []interface{} - headers = map[string]string{ - "Content-Type": "application/json", - } + + // res *fasthttp.Response + resData Response ) key, 
err = sources.PickRandom(config.Keys.URLScan) @@ -49,8 +50,12 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc return } + reqHeaders := map[string]string{ + "Content-Type": "application/json", + } + if len(config.Keys.URLScan) > 0 { - headers["API-Key"] = key + reqHeaders["API-Key"] = key } for { @@ -65,42 +70,47 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc reqURL := baseURL + "?" + params.Encode() - res, err = httpclient.Request(fasthttp.MethodGet, reqURL, "", headers, nil) + var res *fasthttp.Response + + res, err = httpclient.Request(fasthttp.MethodGet, reqURL, "", reqHeaders, nil) if err != nil { return } - body := res.Body() - - var results response + var data Response - if err = json.Unmarshal(body, &results); err != nil { + if err = json.Unmarshal(res.Body(), &data); err != nil { return } - if results.Status == 429 { + if data.Status == 429 { break } - for _, i := range results.Results { - URL := i.Page.URL + for index := range data.Results { + URL := data.Results[index].Page.URL - if !sources.IsValid(URL) { - continue + if data.Results[index].Page.Domain == config.Domain || + strings.HasSuffix(data.Results[index].Page.Domain, config.Domain) { + URLsChannel <- sources.URL{Source: source.Name(), Value: URL} } - if !sources.IsInScope(URL, config.Domain, config.IncludeSubdomains) { - return - } + // if !sources.IsValid(URL) { + // continue + // } + + // if !sources.IsInScope(URL, config.Domain, config.IncludeSubdomains) { + // return + // } - URLsChannel <- sources.URL{Source: source.Name(), Value: URL} + // URLsChannel <- sources.URL{Source: source.Name(), Value: URL} } - if !results.HasMore { + if !resData.HasMore { break } - lastResult := results.Results[len(results.Results)-1] + lastResult := resData.Results[len(resData.Results)-1] searchAfter = lastResult.Sort } }() diff --git a/pkg/xurlfind3r/sources/utils.go b/pkg/xurlfind3r/sources/utils.go index 1fb49ff..9c2bbbd 100644 --- a/pkg/xurlfind3r/sources/utils.go +++ b/pkg/xurlfind3r/sources/utils.go @@ -37,7 +37,9 @@ func PickRandom[T any](v []T) (picked T, err error) { } func IsValid(URL string) (isValid bool) { - var err error + var ( + err error + ) _, err = hqgourl.Parse(URL) if err != nil { From 386cdbf10b60c3c4f0b9487f54c6cd4ba5f94054 Mon Sep 17 00:00:00 2001 From: "Alex Munene (@enenumxela)" <62714471+enenumxela@users.noreply.github.com> Date: Wed, 28 Jun 2023 16:14:34 +0300 Subject: [PATCH 04/24] refactor: Make wayback snapshots parsing opt in resolves #32 --- cmd/xurlfind3r/main.go | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/cmd/xurlfind3r/main.go b/cmd/xurlfind3r/main.go index 0d879c2..393bb4a 100644 --- a/cmd/xurlfind3r/main.go +++ b/cmd/xurlfind3r/main.go @@ -26,10 +26,10 @@ var ( includeSubdomains bool - listSources bool - sourcesToUse []string - skipWaybackRobots bool - skipWaybackSource bool + listSources bool + sourcesToUse []string + parseWaybackRobots bool + parseWaybackSource bool filterPattern string matchPattern string @@ -47,13 +47,12 @@ func init() { // Handle CLI arguments, flags & help message (pflag) pflag.StringVarP(&domain, "domain", "d", "", "") - pflag.BoolVar(&includeSubdomains, "include-subdomains", false, "") pflag.BoolVarP(&listSources, "sources", "s", false, "") pflag.StringSliceVarP(&sourcesToUse, "use-sources", "u", sources.List, "") - pflag.BoolVar(&skipWaybackRobots, "skip-wayback-robots", false, "") - pflag.BoolVar(&skipWaybackSource, "skip-wayback-source", false, "") + 
pflag.BoolVar(&parseWaybackRobots, "parse-wayback-robots", false, "") + pflag.BoolVar(&parseWaybackSource, "parse-wayback-source", false, "") pflag.StringVarP(&filterPattern, "filter", "f", "", "") pflag.StringVarP(&matchPattern, "match", "m", "", "") @@ -72,16 +71,14 @@ func init() { h += " xurlfind3r [OPTIONS]\n" h += "\nTARGET:\n" - h += " -d, --domain string (sub)domain to match URLs\n" - - h += "\nSCOPE:\n" + h += " -d, --domain string domain to match URLs\n" h += " --include-subdomains bool match subdomain's URLs\n" h += "\nSOURCES:\n" h += " -s, --sources bool list sources\n" - h += fmt.Sprintf(" -u, --use-sources string sources to use (default: %s)\n", strings.Join(sources.List, ",")) - h += " --skip-wayback-robots bool with wayback, skip parsing robots.txt snapshots\n" - h += " --skip-wayback-source bool with wayback, skip parsing source code snapshots\n" + h += fmt.Sprintf(" -u, --use-sources strings sources to use (default: %s)\n", strings.Join(sources.List, ",")) + h += " --parse-wayback-robots bool with wayback, parse robots.txt snapshots\n" + h += " --parse-wayback-source bool with wayback, parse source code snapshots\n" h += "\nFILTER & MATCH:\n" h += " -f, --filter string regex to filter URLs\n" @@ -177,8 +174,8 @@ func main() { IncludeSubdomains: includeSubdomains, Sources: sourcesToUse, Keys: config.Keys, - ParseWaybackRobots: !skipWaybackRobots, - ParseWaybackSource: !skipWaybackSource, + ParseWaybackRobots: parseWaybackRobots, + ParseWaybackSource: parseWaybackSource, FilterPattern: filterPattern, Matchattern: matchPattern, } From 4610e8c0bbf94f3cc866f1a6041dfa30d3af9bb6 Mon Sep 17 00:00:00 2001 From: "Alex Munene (@enenumxela)" <62714471+enenumxela@users.noreply.github.com> Date: Wed, 28 Jun 2023 16:33:53 +0300 Subject: [PATCH 05/24] feat: Add sources to use and to exclude options resolves #33 --- cmd/xurlfind3r/main.go | 16 +++++++++------- pkg/xurlfind3r/xurlfind3r.go | 19 ++++++++++++++++--- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/cmd/xurlfind3r/main.go b/cmd/xurlfind3r/main.go index 393bb4a..a681c44 100644 --- a/cmd/xurlfind3r/main.go +++ b/cmd/xurlfind3r/main.go @@ -14,7 +14,6 @@ import ( "github.com/hueristiq/hqgolog/levels" "github.com/hueristiq/xurlfind3r/internal/configuration" "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r" - "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/sources" "github.com/logrusorgru/aurora/v3" "github.com/spf13/pflag" ) @@ -22,12 +21,12 @@ import ( var ( au aurora.Aurora - domain string - + domain string includeSubdomains bool listSources bool sourcesToUse []string + sourcesToExclude []string parseWaybackRobots bool parseWaybackSource bool @@ -50,7 +49,8 @@ func init() { pflag.BoolVar(&includeSubdomains, "include-subdomains", false, "") pflag.BoolVarP(&listSources, "sources", "s", false, "") - pflag.StringSliceVarP(&sourcesToUse, "use-sources", "u", sources.List, "") + pflag.StringSliceVarP(&sourcesToUse, "use-sources", "u", []string{}, "") + pflag.StringSliceVarP(&sourcesToExclude, "exclude-sources", "e", []string{}, "") pflag.BoolVar(&parseWaybackRobots, "parse-wayback-robots", false, "") pflag.BoolVar(&parseWaybackSource, "parse-wayback-source", false, "") @@ -75,8 +75,9 @@ func init() { h += " --include-subdomains bool match subdomain's URLs\n" h += "\nSOURCES:\n" - h += " -s, --sources bool list sources\n" - h += fmt.Sprintf(" -u, --use-sources strings sources to use (default: %s)\n", strings.Join(sources.List, ",")) + h += " -s, --sources bool list supported sources\n" + h += " -u, --use-sources strings 
comma(,) separated sources to use\n" + h += " -e, --exclude-sources strings comma(,) separated sources to exclude\n" h += " --parse-wayback-robots bool with wayback, parse robots.txt snapshots\n" h += " --parse-wayback-source bool with wayback, parse source code snapshots\n" @@ -172,7 +173,8 @@ func main() { options := &xurlfind3r.Options{ Domain: domain, IncludeSubdomains: includeSubdomains, - Sources: sourcesToUse, + SourcesToUSe: sourcesToUse, + SourcesToExclude: sourcesToExclude, Keys: config.Keys, ParseWaybackRobots: parseWaybackRobots, ParseWaybackSource: parseWaybackSource, diff --git a/pkg/xurlfind3r/xurlfind3r.go b/pkg/xurlfind3r/xurlfind3r.go index 9278073..ed64e00 100644 --- a/pkg/xurlfind3r/xurlfind3r.go +++ b/pkg/xurlfind3r/xurlfind3r.go @@ -17,7 +17,8 @@ import ( type Options struct { Domain string IncludeSubdomains bool - Sources []string + SourcesToUSe []string + SourcesToExclude []string Keys sources.Keys ParseWaybackRobots bool ParseWaybackSource bool @@ -61,8 +62,13 @@ func New(options *Options) (finder *Finder, err error) { } } - for index := range options.Sources { - source := options.Sources[index] + // Sources To Use + if len(options.SourcesToUSe) < 1 { + options.SourcesToUSe = sources.List + } + + for index := range options.SourcesToUSe { + source := options.SourcesToUSe[index] switch source { case "bevigil": @@ -82,6 +88,13 @@ func New(options *Options) (finder *Finder, err error) { } } + // Sources To Exclude + for index := range options.SourcesToExclude { + source := options.SourcesToExclude[index] + + delete(finder.Sources, source) + } + return } From 70fb5f6bfc62134337fd875973f349891906260b Mon Sep 17 00:00:00 2001 From: "Alex Munene (@enenumxela)" <62714471+enenumxela@users.noreply.github.com> Date: Thu, 29 Jun 2023 17:06:13 +0300 Subject: [PATCH 06/24] chore: - --- README.md | 14 +- cmd/xurlfind3r/main.go | 30 +-- go.mod | 1 - go.sum | 5 - .../httpclient/{httpclient.go => client.go} | 12 +- pkg/xurlfind3r/httpclient/user-agent.go | 52 ++++ pkg/xurlfind3r/sources/bevigil/bevigil.go | 7 +- .../sources/commoncrawl/commoncrawl.go | 26 +- pkg/xurlfind3r/sources/configuration.go | 1 + pkg/xurlfind3r/sources/github/github.go | 252 +++++++++++++----- pkg/xurlfind3r/sources/intelx/intelx.go | 49 ++-- pkg/xurlfind3r/sources/otx/otx.go | 21 +- pkg/xurlfind3r/sources/urlscan/urlscan.go | 42 ++- pkg/xurlfind3r/sources/utils.go | 21 -- pkg/xurlfind3r/sources/wayback/wayback.go | 18 +- .../sources/wayback/waybackrobots.go | 62 ++--- .../sources/wayback/waybacksource.go | 54 ++-- pkg/xurlfind3r/xurlfind3r.go | 6 +- 18 files changed, 384 insertions(+), 289 deletions(-) rename pkg/xurlfind3r/httpclient/{httpclient.go => client.go} (89%) create mode 100644 pkg/xurlfind3r/httpclient/user-agent.go diff --git a/README.md b/README.md index 2ba3732..588be10 100644 --- a/README.md +++ b/README.md @@ -160,16 +160,15 @@ USAGE: xurlfind3r [OPTIONS] TARGET: - -d, --domain string (sub)domain to match URLs - -SCOPE: + -d, --domain string domain to match URLs --include-subdomains bool match subdomain's URLs SOURCES: - -s, --sources bool list sources - -u, --use-sources string sources to use (default: bevigil,commoncrawl,github,intelx,otx,urlscan,wayback) - --skip-wayback-robots bool with wayback, skip parsing robots.txt snapshots - --skip-wayback-source bool with wayback, skip parsing source code snapshots + -s, --sources bool list supported sources + -u, --use-sources strings comma(,) separated sources to use + -e, --exclude-sources strings comma(,) separated sources to exclude + 
--parse-wayback-robots bool with wayback, parse robots.txt snapshots + --parse-wayback-source bool with wayback, parse source code snapshots FILTER & MATCH: -f, --filter string regex to filter URLs @@ -183,6 +182,7 @@ OUTPUT: CONFIGURATION: -c, --configuration string configuration file path (default: ~/.hueristiq/xurlfind3r/config.yaml) +pflag: help requested ``` ### Examples diff --git a/cmd/xurlfind3r/main.go b/cmd/xurlfind3r/main.go index a681c44..3ca7621 100644 --- a/cmd/xurlfind3r/main.go +++ b/cmd/xurlfind3r/main.go @@ -21,46 +21,38 @@ import ( var ( au aurora.Aurora - domain string - includeSubdomains bool - + domain string + includeSubdomains bool listSources bool sourcesToUse []string sourcesToExclude []string parseWaybackRobots bool parseWaybackSource bool - - filterPattern string - matchPattern string - - monochrome bool - output string - verbosity string - - YAMLConfigFile string + filterPattern string + matchPattern string + monochrome bool + output string + verbosity string + YAMLConfigFile string ) func init() { // defaults - defaultYAMLConfigFile := "~/.hueristiq/xurlfind3r/config.yaml" + defaultYAMLConfigFile := fmt.Sprintf("~/.hueristiq/%s/config.yaml", configuration.NAME) // Handle CLI arguments, flags & help message (pflag) pflag.StringVarP(&domain, "domain", "d", "", "") pflag.BoolVar(&includeSubdomains, "include-subdomains", false, "") - pflag.BoolVarP(&listSources, "sources", "s", false, "") pflag.StringSliceVarP(&sourcesToUse, "use-sources", "u", []string{}, "") pflag.StringSliceVarP(&sourcesToExclude, "exclude-sources", "e", []string{}, "") pflag.BoolVar(&parseWaybackRobots, "parse-wayback-robots", false, "") pflag.BoolVar(&parseWaybackSource, "parse-wayback-source", false, "") - pflag.StringVarP(&filterPattern, "filter", "f", "", "") pflag.StringVarP(&matchPattern, "match", "m", "", "") - pflag.BoolVar(&monochrome, "no-color", false, "") pflag.StringVarP(&output, "output", "o", "", "") pflag.StringVarP(&verbosity, "verbosity", "v", string(levels.LevelInfo), "") - pflag.StringVarP(&YAMLConfigFile, "configuration", "c", defaultYAMLConfigFile, "") pflag.CommandLine.SortFlags = false @@ -104,7 +96,7 @@ func init() { Colorize: !monochrome, })) - // Create | Update configuration + // Create or Update configuration if strings.HasPrefix(YAMLConfigFile, "~") { home, err := os.UserHomeDir() if err != nil { @@ -164,7 +156,7 @@ func main() { hqgolog.Info().Msgf("finding URLs for %v.", au.Underline(domain).Bold()) if includeSubdomains { - hqgolog.Info().Msg("`--include-subdomains` used: includes subdomains' URLs.") + hqgolog.Info().Msg("`--include-subdomains` used: match subdomain's URLs.") } hqgolog.Print().Msg("") diff --git a/go.mod b/go.mod index 28bba72..295c35b 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,6 @@ go 1.20 require ( dario.cat/mergo v1.0.0 - github.com/corpix/uarand v0.2.0 github.com/hueristiq/hqgolimit v0.0.0-20230623113203-3e14552a97f8 github.com/hueristiq/hqgolog v0.0.0-20230623101640-92de7a10a4bb github.com/hueristiq/hqgourl v0.0.0-20230623095947-4dee5ebb9a96 diff --git a/go.sum b/go.sum index 9f3807f..1510b43 100644 --- a/go.sum +++ b/go.sum @@ -2,9 +2,6 @@ dario.cat/mergo v1.0.0 h1:AGCNq9Evsj31mOgNPcLyXc+4PNABt905YmuqPYYpBWk= dario.cat/mergo v1.0.0/go.mod h1:uNxQE+84aUszobStD9th8a29P2fMDhsBdgRYvZOxGmk= github.com/andybalholm/brotli v1.0.5 h1:8uQZIdzKmjc/iuPu7O2ioW48L81FgatrcpfFmiq/cCs= github.com/andybalholm/brotli v1.0.5/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig= -github.com/corpix/uarand v0.2.0 h1:U98xXwud/AVuCpkpgfPF7J5TQgr7R5tqT8VZP5KWbzE= 
-github.com/corpix/uarand v0.2.0/go.mod h1:/3Z1QIqWkDIhf6XWn/08/uMHoQ8JUoTIKc2iPchBOmM= -github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= github.com/hueristiq/hqgolimit v0.0.0-20230623113203-3e14552a97f8 h1:r4ze6pX8H//X4SJEIcn8wHPgAhaGKEaa44lyHh1epXY= github.com/hueristiq/hqgolimit v0.0.0-20230623113203-3e14552a97f8/go.mod h1:CzhJzxz2rv/NMKNz5b4eKFh1epdcED05YTHT32NFyrI= github.com/hueristiq/hqgolog v0.0.0-20230623101640-92de7a10a4bb h1:DQUVIiWnrTDQ4MP6UJw7/fMkySN+PYonDhlgBh31DDI= @@ -15,10 +12,8 @@ github.com/klauspost/compress v1.16.3 h1:XuJt9zzcnaz6a16/OU53ZjWp/v7/42WcR5t2a0P github.com/klauspost/compress v1.16.3/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE= github.com/logrusorgru/aurora/v3 v3.0.0 h1:R6zcoZZbvVcGMvDCKo45A9U/lzYyzl5NfYIvznmDfE4= github.com/logrusorgru/aurora/v3 v3.0.0/go.mod h1:vsR12bk5grlLvLXAYrBsb5Oc/N+LxAlxggSjiwMnCUc= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= -github.com/stretchr/testify v1.7.1 h1:5TQK59W5E3v0r2duFAb7P95B6hEeOyEnHRa8MjYSMTY= github.com/tomnomnom/linkheader v0.0.0-20180905144013-02ca5825eb80 h1:nrZ3ySNYwJbSpD6ce9duiP+QkD3JuLCcWkdaehUS/3Y= github.com/tomnomnom/linkheader v0.0.0-20180905144013-02ca5825eb80/go.mod h1:iFyPdL66DjUD96XmzVL3ZntbzcflLnznH0fr99w5VqE= github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw= diff --git a/pkg/xurlfind3r/httpclient/httpclient.go b/pkg/xurlfind3r/httpclient/client.go similarity index 89% rename from pkg/xurlfind3r/httpclient/httpclient.go rename to pkg/xurlfind3r/httpclient/client.go index b775b3e..d648501 100644 --- a/pkg/xurlfind3r/httpclient/httpclient.go +++ b/pkg/xurlfind3r/httpclient/client.go @@ -3,7 +3,6 @@ package httpclient import ( "fmt" - "github.com/corpix/uarand" "github.com/valyala/fasthttp" ) @@ -25,14 +24,21 @@ func httpRequestWrapper(req *fasthttp.Request) (res *fasthttp.Response, err erro return } -func Request(method, URL, cookies string, headers map[string]string, body []byte) (*fasthttp.Response, error) { +func Request(method, URL, cookies string, headers map[string]string, body []byte) (res *fasthttp.Response, err error) { req := fasthttp.AcquireRequest() req.SetRequestURI(URL) req.SetBody(body) req.Header.SetMethod(method) - req.Header.Set("User-Agent", uarand.GetRandom()) + var agent string + + agent, err = UserAgent() + if err != nil { + return + } + + req.Header.Set("User-Agent", agent) req.Header.Set("Accept", "*/*") req.Header.Set("Accept-Language", "en") req.Header.Set("Connection", "close") diff --git a/pkg/xurlfind3r/httpclient/user-agent.go b/pkg/xurlfind3r/httpclient/user-agent.go new file mode 100644 index 0000000..639c3b3 --- /dev/null +++ b/pkg/xurlfind3r/httpclient/user-agent.go @@ -0,0 +1,52 @@ +package httpclient + +import ( + "crypto/rand" + "fmt" + "math/big" +) + +var agents = []string{ + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0", + "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/605.1.15 (KHTML, 
like Gecko) Version/12.1 Safari/605.1.15", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0", + "Mozilla/5.0 (iPhone; CPU iPhone OS 8_4_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12H321 Safari/600.1.4", + "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko", + "Mozilla/5.0 (iPad; CPU OS 7_1_2 like Mac OS X) AppleWebKit/537.51.2 (KHTML, like Gecko) Version/7.0 Mobile/11D257 Safari/9537.53", + "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)", +} + +func UserAgent() (agent string, err error) { + return pickRandom(agents) +} + +func pickRandom[T any](v []T) (picked T, err error) { + length := len(v) + + if length == 0 { + return + } + + // Generate a cryptographically secure random index + max := big.NewInt(int64(length)) + + var indexBig *big.Int + + indexBig, err = rand.Int(rand.Reader, max) + if err != nil { + err = fmt.Errorf("failed to generate random index: %v", err) + + return + } + + index := indexBig.Int64() + + // Return the element at the random index + picked = v[index] + + return +} diff --git a/pkg/xurlfind3r/sources/bevigil/bevigil.go b/pkg/xurlfind3r/sources/bevigil/bevigil.go index 3ee1437..363ff6c 100644 --- a/pkg/xurlfind3r/sources/bevigil/bevigil.go +++ b/pkg/xurlfind3r/sources/bevigil/bevigil.go @@ -24,9 +24,10 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc var ( err error - key string ) + var key string + key, err = sources.PickRandom(config.Keys.Bevigil) if key == "" || err != nil { return @@ -56,6 +57,10 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc for index := range data.URLs { URL := data.URLs[index] + if !sources.IsInScope(URL, config.Domain, config.IncludeSubdomains) { + return + } + URLsChannel <- sources.URL{Source: source.Name(), Value: URL} } }() diff --git a/pkg/xurlfind3r/sources/commoncrawl/commoncrawl.go b/pkg/xurlfind3r/sources/commoncrawl/commoncrawl.go index edd857f..49c18e5 100644 --- a/pkg/xurlfind3r/sources/commoncrawl/commoncrawl.go +++ b/pkg/xurlfind3r/sources/commoncrawl/commoncrawl.go @@ -35,16 +35,16 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc err error ) - var res *fasthttp.Response + var indexesRes *fasthttp.Response - res, err = httpclient.SimpleGet("https://index.commoncrawl.org/collinfo.json") + indexesRes, err = httpclient.SimpleGet("https://index.commoncrawl.org/collinfo.json") if err != nil { return } var APIs []API - if err = json.Unmarshal(res.Body(), &APIs); err != nil { + if err = json.Unmarshal(indexesRes.Body(), &APIs); err != nil { return } @@ -60,20 +60,18 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc var ( err error - // headers = map[string]string{"Host": "index.commoncrawl.org"} - // res *fasthttp.Response ) - reqHeaders := map[string]string{"Host": "index.commoncrawl.org"} + contentReqHeaders := map[string]string{"Host": "index.commoncrawl.org"} - var res *fasthttp.Response + var contentRes *fasthttp.Response - res, err = httpclient.Get(fmt.Sprintf("%s?url=*.%s/*&output=json&fl=url", API, config.Domain), "", reqHeaders) + contentRes, err = httpclient.Get(fmt.Sprintf("%s?url=*.%s/*&output=json&fl=url", API, config.Domain), "", contentReqHeaders) if err != nil { return } - scanner := bufio.NewScanner(bytes.NewReader(res.Body())) + scanner := 
bufio.NewScanner(bytes.NewReader(contentRes.Body())) for scanner.Scan() { var data Response @@ -88,13 +86,9 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc URL := data.URL - // if !sources.IsValid(URL) { - // return - // } - - // if !sources.IsInScope(URL, config.Domain, config.IncludeSubdomains) { - // return - // } + if !sources.IsInScope(URL, config.Domain, config.IncludeSubdomains) { + return + } URLsChannel <- sources.URL{Source: source.Name(), Value: URL} } diff --git a/pkg/xurlfind3r/sources/configuration.go b/pkg/xurlfind3r/sources/configuration.go index e2ba789..d49b8eb 100644 --- a/pkg/xurlfind3r/sources/configuration.go +++ b/pkg/xurlfind3r/sources/configuration.go @@ -10,6 +10,7 @@ type Configuration struct { Keys Keys ParseWaybackRobots bool ParseWaybackSource bool + LinkFinderRegex *regexp.Regexp URLsRegex *regexp.Regexp MediaURLsRegex *regexp.Regexp RobotsURLsRegex *regexp.Regexp diff --git a/pkg/xurlfind3r/sources/github/github.go b/pkg/xurlfind3r/sources/github/github.go index 6fc6fcb..027d9c2 100644 --- a/pkg/xurlfind3r/sources/github/github.go +++ b/pkg/xurlfind3r/sources/github/github.go @@ -18,23 +18,23 @@ import ( "github.com/valyala/fasthttp" ) -type Source struct{} - -type textMatch struct { - Fragment string `json:"fragment"` +type Response struct { + TotalCount int `json:"total_count"` + Items []Item `json:"items"` } -type item struct { +type Item struct { Name string `json:"name"` HTMLURL string `json:"html_url"` - TextMatches []textMatch `json:"text_matches"` + TextMatches []TextMatch `json:"text_matches"` } -type response struct { - TotalCount int `json:"total_count"` - Items []item `json:"items"` +type TextMatch struct { + Fragment string `json:"fragment"` } +type Source struct{} + func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sources.URL) { URLsChannel = make(chan sources.URL) @@ -47,7 +47,7 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc tokens := NewTokenManager(config.Keys.GitHub) - searchURL := fmt.Sprintf("https://api.github.com/search/code?per_page=100&q=%s&sort=created&order=asc", config.Domain) + searchURL := fmt.Sprintf("https://api.github.com/search/code?per_page=100&q=%q&sort=created&order=asc", config.Domain) source.Enumerate(searchURL, config.URLsRegex, tokens, URLsChannel, config) }() @@ -55,7 +55,7 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc return URLsChannel } -func (source *Source) Enumerate(searchURL string, domainRegexp *regexp.Regexp, tokens *Tokens, URLsChannel chan sources.URL, config *sources.Configuration) { +func (source *Source) Enumerate(searchURL string, URLsRegex *regexp.Regexp, tokens *Tokens, URLsChannel chan sources.URL, config *sources.Configuration) { token := tokens.Get() if token.RetryAfter > 0 { @@ -66,43 +66,96 @@ func (source *Source) Enumerate(searchURL string, domainRegexp *regexp.Regexp, t } } - var ( - err error - headers = map[string]string{ - "Accept": "application/vnd.github.v3.text-match+json", - "Authorization": "token " + token.Hash, - } - res *fasthttp.Response - ) + reqHeaders := map[string]string{ + "Accept": "application/vnd.github.v3.text-match+json", + "Authorization": "token " + token.Hash, + } - res, err = httpclient.Request(fasthttp.MethodGet, searchURL, "", headers, nil) + searchRes, err := httpclient.Request(fasthttp.MethodGet, searchURL, "", reqHeaders, nil) - isForbidden := res != nil && res.StatusCode() == fasthttp.StatusForbidden + isForbidden := searchRes 
!= nil && searchRes.StatusCode() == fasthttp.StatusForbidden if err != nil && !isForbidden { return } - ratelimitRemaining, _ := strconv.ParseInt(string(res.Header.Peek("X-Ratelimit-Remaining")), 10, 64) + ratelimitRemaining, _ := strconv.ParseInt(string(searchRes.Header.Peek("X-Ratelimit-Remaining")), 10, 64) + if isForbidden && ratelimitRemaining == 0 { - retryAfterSeconds, _ := strconv.ParseInt(string(res.Header.Peek("Retry-After")), 10, 64) + retryAfterSeconds, _ := strconv.ParseInt(string(searchRes.Header.Peek("Retry-After")), 10, 64) tokens.setCurrentTokenExceeded(retryAfterSeconds) - source.Enumerate(searchURL, domainRegexp, tokens, URLsChannel, config) + source.Enumerate(searchURL, URLsRegex, tokens, URLsChannel, config) } - var results response + var searchResData Response - if err = json.Unmarshal(res.Body(), &results); err != nil { + if err = json.Unmarshal(searchRes.Body(), &searchResData); err != nil { return } - err = proccesItems(results.Items, domainRegexp, source.Name(), URLsChannel, config) - if err != nil { - return + // Process Items + for index := range searchResData.Items { + item := searchResData.Items[index] + + reqURL := getRawContentURL(item.HTMLURL) + + var contentRes *fasthttp.Response + + contentRes, err = httpclient.SimpleGet(reqURL) + if err != nil { + continue + } + + if contentRes.StatusCode() != fasthttp.StatusOK { + continue + } + + scanner := bufio.NewScanner(bytes.NewReader(contentRes.Body())) + + for scanner.Scan() { + line := scanner.Text() + if line == "" { + continue + } + + URLs := URLsRegex.FindAllString(normalizeContent(line), -1) + + for index := range URLs { + URL := URLs[index] + URL = fixURL(URL) + + if !sources.IsInScope(URL, config.Domain, config.IncludeSubdomains) { + return + } + + URLsChannel <- sources.URL{Source: source.Name(), Value: URL} + } + } + + if scanner.Err() != nil { + return + } + + for index := range item.TextMatches { + textMatch := item.TextMatches[index] + + URLs := URLsRegex.FindAllString(normalizeContent(textMatch.Fragment), -1) + + for index := range URLs { + URL := URLs[index] + URL = fixURL(URL) + + if !sources.IsInScope(URL, config.Domain, config.IncludeSubdomains) { + return + } + + URLsChannel <- sources.URL{Source: source.Name(), Value: URL} + } + } } - linksHeader := linkheader.Parse(string(res.Header.Peek("Link"))) + linksHeader := linkheader.Parse(string(searchRes.Header.Peek("Link"))) for _, link := range linksHeader { if link.Rel == "next" { @@ -111,72 +164,123 @@ func (source *Source) Enumerate(searchURL string, domainRegexp *regexp.Regexp, t return } - source.Enumerate(nextURL, domainRegexp, tokens, URLsChannel, config) + source.Enumerate(nextURL, URLsRegex, tokens, URLsChannel, config) } } } -func proccesItems(items []item, domainRegexp *regexp.Regexp, name string, URLsChannel chan sources.URL, config *sources.Configuration) (err error) { - for _, item := range items { - var ( - res *fasthttp.Response - URL string - ) +func getRawContentURL(URL string) (rawContentURL string) { + rawContentURL = URL + rawContentURL = strings.ReplaceAll(rawContentURL, "https://github.com/", "https://raw.githubusercontent.com/") + rawContentURL = strings.ReplaceAll(rawContentURL, "/blob/", "/") - res, err = httpclient.SimpleGet(rawContentURL(item.HTMLURL)) - if err != nil { - continue - } + return +} - if res.StatusCode() == fasthttp.StatusOK { - scanner := bufio.NewScanner(bytes.NewReader(res.Body())) - for scanner.Scan() { - line := scanner.Text() - if line == "" { - continue - } +func normalizeContent(content string) 
(normalizedContent string) { + normalizedContent = content + normalizedContent, _ = url.QueryUnescape(normalizedContent) + normalizedContent = strings.ReplaceAll(normalizedContent, "\\t", "") + normalizedContent = strings.ReplaceAll(normalizedContent, "\\n", "") + + return +} - for _, URL = range domainRegexp.FindAllString(normalizeContent(line), -1) { - // if !sources.IsValid(URL) { - // continue - // } +func fixURL(URL string) (fixedURL string) { + fixedURL = URL - // if !sources.IsInScope(URL, config.Domain, config.IncludeSubdomains) { - // return - // } + // ',",`, + quotes := []rune{'\'', '"', '`'} - URLsChannel <- sources.URL{Source: name, Value: URL} - } - } + for i := range quotes { + quote := quotes[i] + + indexOfQuote := findUnbalancedQuote(URL, quote) + if indexOfQuote <= len(fixedURL) && indexOfQuote >= 0 { + fixedURL = fixedURL[:indexOfQuote] } + } - for _, textMatch := range item.TextMatches { - for _, URL = range domainRegexp.FindAllString(normalizeContent(textMatch.Fragment), -1) { - // if !sources.IsValid(URL) { - // continue - // } + // (),[],{} + parentheses := []struct { + Opening, Closing rune + }{ + {'[', ']'}, + {'(', ')'}, + {'{', '}'}, + } - URLsChannel <- sources.URL{Source: name, Value: URL} - } + for i := range parentheses { + parenthesis := parentheses[i] + + indexOfParenthesis := findUnbalancedBracket(URL, parenthesis.Opening, parenthesis.Closing) + if indexOfParenthesis <= len(fixedURL) && indexOfParenthesis >= 0 { + fixedURL = fixedURL[:indexOfParenthesis] } } + // ; + indexOfComma := strings.Index(fixedURL, ";") + if indexOfComma <= len(fixedURL) && indexOfComma >= 0 { + fixedURL = fixedURL[:indexOfComma] + } + return } -func normalizeContent(content string) string { - content, _ = url.QueryUnescape(content) - content = strings.ReplaceAll(content, "\\t", "") - content = strings.ReplaceAll(content, "\\n", "") +func findUnbalancedQuote(s string, quoteChar rune) int { + insideQuotes := false + + for _, ch := range s { + if ch == quoteChar { + if insideQuotes { + insideQuotes = false + } else { + insideQuotes = true + } + } + } + + // If still inside quotes at the end of the string, + // find the index of the opening quote + if insideQuotes { + for i, ch := range s { + if ch == quoteChar { + return i + } + } + } - return content + return -1 // return -1 if all quotes are balanced } -func rawContentURL(URL string) string { - URL = strings.ReplaceAll(URL, "https://github.com/", "https://raw.githubusercontent.com/") - URL = strings.ReplaceAll(URL, "/blob/", "/") +func findUnbalancedBracket(s string, openChar, closeChar rune) int { + openCount := 0 + + var firstOpenIndex int + + for i, ch := range s { + if ch == openChar { + if openCount == 0 { + firstOpenIndex = i + } + + openCount++ + } else if ch == closeChar { + openCount-- + + if openCount < 0 { + return i // Found an unbalanced closing bracket + } + } + } + + // If there are unmatched opening brackets + if openCount > 0 { + return firstOpenIndex + } - return URL + return -1 // All brackets are balanced } func (source *Source) Name() string { diff --git a/pkg/xurlfind3r/sources/intelx/intelx.go b/pkg/xurlfind3r/sources/intelx/intelx.go index 9e3ec87..700dd14 100644 --- a/pkg/xurlfind3r/sources/intelx/intelx.go +++ b/pkg/xurlfind3r/sources/intelx/intelx.go @@ -4,6 +4,7 @@ package intelx import ( "encoding/json" "fmt" + "net/mail" "strings" "time" @@ -43,19 +44,20 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc var ( err error - key string - - body []byte - - res 
*fasthttp.Response ) + var key string + key, err = sources.PickRandom(config.Keys.Intelx) if key == "" || err != nil { return } parts := strings.Split(key, ":") + if len(parts) != 2 { + return + } + intelXHost := parts[0] intelXKey := parts[1] @@ -71,23 +73,27 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc Timeout: 20, } + var body []byte + body, err = json.Marshal(searchReqBody) if err != nil { return } + var res *fasthttp.Response + res, err = httpclient.SimplePost(searchURL, "application/json", body) if err != nil { return } - var response SearchResponse + var resData SearchResponse - if err = json.Unmarshal(res.Body(), &response); err != nil { + if err = json.Unmarshal(res.Body(), &resData); err != nil { return } - resultsURL := fmt.Sprintf("https://%s/phonebook/search/result?k=%s&id=%s&limit=10000", intelXHost, intelXKey, response.ID) + resultsURL := fmt.Sprintf("https://%s/phonebook/search/result?k=%s&id=%s&limit=10000", intelXHost, intelXKey, resData.ID) status := 0 for status == 0 || status == 3 { @@ -96,24 +102,24 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc return } - var response searchResultType + var resData searchResultType - if err = json.Unmarshal(res.Body(), &response); err != nil { + if err = json.Unmarshal(res.Body(), &resData); err != nil { return } - status = response.Status + status = resData.Status - for _, hostname := range response.Selectors { + for _, hostname := range resData.Selectors { URL := hostname.Selectvalue - // if !sources.IsValid(URL) { - // continue - // } + if isEmail(URL) { + continue + } - // if !sources.IsInScope(URL, config.Domain, config.IncludeSubdomains) { - // return - // } + if !sources.IsInScope(URL, config.Domain, config.IncludeSubdomains) { + return + } URLsChannel <- sources.URL{Source: source.Name(), Value: URL} } @@ -123,6 +129,13 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc return } +func isEmail(URL string) (isEmail bool) { + _, err := mail.ParseAddress(URL) + isEmail = err == nil + + return +} + func (source *Source) Name() string { return "intelx" } diff --git a/pkg/xurlfind3r/sources/otx/otx.go b/pkg/xurlfind3r/sources/otx/otx.go index 2a6c909..2a14e70 100644 --- a/pkg/xurlfind3r/sources/otx/otx.go +++ b/pkg/xurlfind3r/sources/otx/otx.go @@ -49,28 +49,23 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc return } - var data Response + var resData Response - if err = json.Unmarshal(res.Body(), &data); err != nil { + if err = json.Unmarshal(res.Body(), &resData); err != nil { return } - for index := range data.URLList { - URL := data.URLList[index].URL - // URL := i.URL + for index := range resData.URLList { + URL := resData.URLList[index].URL - // if !sources.IsValid(URL) { - // continue - // } - - // if !sources.IsInScope(URL, config.Domain, config.IncludeSubdomains) { - // return - // } + if !sources.IsInScope(URL, config.Domain, config.IncludeSubdomains) { + return + } URLsChannel <- sources.URL{Source: source.Name(), Value: URL} } - if !data.HasNext { + if !resData.HasNext { break } } diff --git a/pkg/xurlfind3r/sources/urlscan/urlscan.go b/pkg/xurlfind3r/sources/urlscan/urlscan.go index 936409f..ad56bf1 100644 --- a/pkg/xurlfind3r/sources/urlscan/urlscan.go +++ b/pkg/xurlfind3r/sources/urlscan/urlscan.go @@ -35,17 +35,7 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc go func() { defer close(URLsChannel) - var ( - err error - key string - - 
searchAfter []interface{} - - // res *fasthttp.Response - resData Response - ) - - key, err = sources.PickRandom(config.Keys.URLScan) + key, err := sources.PickRandom(config.Keys.URLScan) if key == "" || err != nil { return } @@ -58,6 +48,8 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc reqHeaders["API-Key"] = key } + var searchAfter []interface{} + for { baseURL := "https://urlscan.io/api/v1/search/" params := url.Values{} @@ -77,33 +69,29 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc return } - var data Response + var resData Response - if err = json.Unmarshal(res.Body(), &data); err != nil { + if err = json.Unmarshal(res.Body(), &resData); err != nil { return } - if data.Status == 429 { + if resData.Status == 429 { break } - for index := range data.Results { - URL := data.Results[index].Page.URL + for index := range resData.Results { + URL := resData.Results[index].Page.URL - if data.Results[index].Page.Domain == config.Domain || - strings.HasSuffix(data.Results[index].Page.Domain, config.Domain) { - URLsChannel <- sources.URL{Source: source.Name(), Value: URL} + if resData.Results[index].Page.Domain != config.Domain || + !strings.HasSuffix(resData.Results[index].Page.Domain, config.Domain) { + continue } - // if !sources.IsValid(URL) { - // continue - // } - - // if !sources.IsInScope(URL, config.Domain, config.IncludeSubdomains) { - // return - // } + if !sources.IsInScope(URL, config.Domain, config.IncludeSubdomains) { + return + } - // URLsChannel <- sources.URL{Source: source.Name(), Value: URL} + URLsChannel <- sources.URL{Source: source.Name(), Value: URL} } if !resData.HasMore { diff --git a/pkg/xurlfind3r/sources/utils.go b/pkg/xurlfind3r/sources/utils.go index 9c2bbbd..182b19f 100644 --- a/pkg/xurlfind3r/sources/utils.go +++ b/pkg/xurlfind3r/sources/utils.go @@ -4,7 +4,6 @@ import ( "crypto/rand" "fmt" "math/big" - "net/mail" "github.com/hueristiq/hqgourl" ) @@ -36,26 +35,6 @@ func PickRandom[T any](v []T) (picked T, err error) { return } -func IsValid(URL string) (isValid bool) { - var ( - err error - ) - - _, err = hqgourl.Parse(URL) - if err != nil { - return - } - - _, err = mail.ParseAddress(URL) - if err == nil { - return - } - - isValid = true - - return -} - func IsInScope(URL, domain string, includeSubdomains bool) (isInScope bool) { parsedURL, err := hqgourl.Parse(URL) if err != nil { diff --git a/pkg/xurlfind3r/sources/wayback/wayback.go b/pkg/xurlfind3r/sources/wayback/wayback.go index b3e8378..09f3518 100644 --- a/pkg/xurlfind3r/sources/wayback/wayback.go +++ b/pkg/xurlfind3r/sources/wayback/wayback.go @@ -29,6 +29,7 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc go func() { defer close(URLsChannel) + // Get wayback URLs waybackURLs := make(chan string) go func() { @@ -58,6 +59,7 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc } }() + // Process wayback Snapshots wg := &sync.WaitGroup{} for URL := range waybackURLs { @@ -66,10 +68,6 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc go func(URL string) { defer wg.Done() - if !sources.IsValid(URL) { - return - } - if !sources.IsInScope(URL, config.Domain, config.IncludeSubdomains) { return } @@ -86,11 +84,7 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc if config.ParseWaybackRobots && config.RobotsURLsRegex.MatchString(URL) { - for robotsURL := range parseWaybackRobots(URL) { - if 
!sources.IsValid(robotsURL) { - continue - } - + for robotsURL := range parseWaybackRobots(config, URL) { if !sources.IsInScope(URL, config.Domain, config.IncludeSubdomains) { return } @@ -99,11 +93,7 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc } } else if config.ParseWaybackSource && !config.RobotsURLsRegex.MatchString(URL) { - for sourceURL := range parseWaybackSource(URL, config.URLsRegex) { - if !sources.IsValid(sourceURL) { - continue - } - + for sourceURL := range parseWaybackSource(config, URL) { if !sources.IsInScope(URL, config.Domain, config.IncludeSubdomains) { return } diff --git a/pkg/xurlfind3r/sources/wayback/waybackrobots.go b/pkg/xurlfind3r/sources/wayback/waybackrobots.go index bc7a28d..5fafa87 100644 --- a/pkg/xurlfind3r/sources/wayback/waybackrobots.go +++ b/pkg/xurlfind3r/sources/wayback/waybackrobots.go @@ -7,92 +7,86 @@ import ( "sync" "github.com/hueristiq/hqgourl" + "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/sources" ) -func parseWaybackRobots(URL string) (URLs chan string) { - URLs = make(chan string) +func parseWaybackRobots(config *sources.Configuration, URL string) (robotsURLs chan string) { + robotsURLs = make(chan string) + + robotsEntryRegex := regexp.MustCompile(`Disallow:\s?.+`) go func() { - defer close(URLs) + defer close(robotsURLs) // retrieve snapshots - var ( - err error - snapshots [][2]string - ) - - snapshots, err = getWaybackSnapshots(URL) + snapshots, err := getWaybackSnapshots(URL) if err != nil { return } - // retrieve conteny + // retrieve and parse snapshots' content for robotsURLs wg := &sync.WaitGroup{} - for _, row := range snapshots { + for index := range snapshots { + row := snapshots[index] + wg.Add(1) go func(row [2]string) { defer wg.Done() - var ( - err error - content string - ) - - content, err = getWaybackContent(row) + content, err := getWaybackContent(row) if err != nil { return } - pattern := regexp.MustCompile(`Disallow:\s?.+`) - - disallowed := pattern.FindAllStringSubmatch(content, -1) + disallowed := robotsEntryRegex.FindAllStringSubmatch(content, -1) if len(disallowed) < 1 { return } - for _, entry := range disallowed { + for index := range disallowed { + entry := disallowed[index] + temp := strings.Split(entry[0], "Disallow:") if len(temp) <= 1 { continue } - endpoint := strings.Trim(temp[1], " ") + robotsURL := strings.Trim(temp[1], " ") - if endpoint == "/" || endpoint == "*" || endpoint == "" { + if robotsURL == "/" || robotsURL == "*" || robotsURL == "" { continue } - endpoint = strings.ReplaceAll(endpoint, "*", "") + robotsURL = strings.ReplaceAll(robotsURL, "*", "") - for strings.HasPrefix(endpoint, "/") { - if len(endpoint) >= 1 { - endpoint = endpoint[1:] // Ex. /*/test or /*/*/demo + for strings.HasPrefix(robotsURL, "/") { + if len(robotsURL) >= 1 { + robotsURL = robotsURL[1:] // Ex. 
/*/test or /*/*/demo } else { continue } } - for strings.HasSuffix(endpoint, "/") { - if len(endpoint) >= 1 { - endpoint = endpoint[0 : len(endpoint)-1] + for strings.HasSuffix(robotsURL, "/") { + if len(robotsURL) >= 1 { + robotsURL = robotsURL[0 : len(robotsURL)-1] } else { continue } } - parsedURL, err := hqgourl.Parse(row[1]) + parsedURL, err := hqgourl.Parse(URL) if err != nil { continue } - endpoint = filepath.Join(parsedURL.Domain, endpoint) - endpoint = parsedURL.Scheme + "://" + endpoint + robotsURL = parsedURL.Scheme + "://" + filepath.Join(parsedURL.Domain, robotsURL) - URLs <- endpoint + robotsURLs <- robotsURL } }(row) } diff --git a/pkg/xurlfind3r/sources/wayback/waybacksource.go b/pkg/xurlfind3r/sources/wayback/waybacksource.go index 5bce161..c95e1b1 100644 --- a/pkg/xurlfind3r/sources/wayback/waybacksource.go +++ b/pkg/xurlfind3r/sources/wayback/waybacksource.go @@ -3,59 +3,45 @@ package wayback import ( "fmt" "mime" - "regexp" "strings" "sync" "github.com/hueristiq/hqgourl" + "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/sources" ) -func parseWaybackSource(URL string, URLsRegex *regexp.Regexp) (URLs chan string) { - URLs = make(chan string) - - parsedURL, err := hqgourl.Parse(URL) - if err != nil { - return - } - - escapedDomain := regexp.QuoteMeta(parsedURL.ETLDPlusOne) - pattern := fmt.Sprintf(`https?://([a-z0-9.-]*\.)?%s(/[a-zA-Z0-9()/*\-+_~:,.?#=]*)?`, escapedDomain) - re := regexp.MustCompile(pattern) +func parseWaybackSource(config *sources.Configuration, URL string) (sourceURLs chan string) { + sourceURLs = make(chan string) go func() { - defer close(URLs) + defer close(sourceURLs) // retrieve snapshots - var ( - err error - snapshots [][2]string - ) - - snapshots, err = getWaybackSnapshots(URL) + snapshots, err := getWaybackSnapshots(URL) if err != nil { return } - // retrieve content + // retrieve and parse snapshots' content for robotsURLs wg := &sync.WaitGroup{} - for _, row := range snapshots { + for index := range snapshots { + row := snapshots[index] + wg.Add(1) go func(row [2]string) { defer wg.Done() - var ( - err error - content string - ) - - content, err = getWaybackContent(row) + content, err := getWaybackContent(row) if err != nil { return } - for _, sourceURL := range URLsRegex.FindAllString(content, -1) { + links := config.LinkFinderRegex.FindAllString(content, -1) + + for index := range links { + sourceURL := links[index] // remove beginning and ending quotes sourceURL = strings.Trim(sourceURL, "\"") sourceURL = strings.Trim(sourceURL, "'") @@ -74,10 +60,10 @@ func parseWaybackSource(URL string, URLsRegex *regexp.Regexp) (URLs chan string) } if parsedSourceURL.IsAbs() { - matches := re.FindAllString(sourceURL, -1) + matches := config.URLsRegex.FindAllString(sourceURL, -1) for _, match := range matches { - URLs <- match + sourceURLs <- match } } else { _, _, err := mime.ParseMediaType(sourceURL) @@ -85,10 +71,10 @@ func parseWaybackSource(URL string, URLsRegex *regexp.Regexp) (URLs chan string) continue } - matches := re.FindAllString(sourceURL, -1) + matches := config.URLsRegex.FindAllString(sourceURL, -1) for _, match := range matches { - URLs <- match + sourceURLs <- match } if len(matches) > 0 { @@ -98,9 +84,9 @@ func parseWaybackSource(URL string, URLsRegex *regexp.Regexp) (URLs chan string) // remove beginning slash sourceURL = strings.TrimLeft(sourceURL, "/") - sourceURL = fmt.Sprintf("%s://%s/%s", parsedURL.Scheme, parsedURL.Domain, sourceURL) + sourceURL = fmt.Sprintf("%s://%s/%s", parsedSourceURL.Scheme, parsedSourceURL.Domain, sourceURL) 
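
The waybackrobots rewrite above boils down to: match `Disallow:` entries in a robots.txt snapshot, strip wildcards and surrounding slashes, and rejoin the remaining path onto the snapshot's scheme and host. A standalone sketch of that normalization, assuming net/url in place of hqgourl and purely illustrative inputs:

    package main

    import (
    	"fmt"
    	"net/url"
    	"path"
    	"regexp"
    	"strings"
    )

    var disallowRegex = regexp.MustCompile(`Disallow:\s?.+`)

    // robotsURLs extracts Disallow paths from a robots.txt body and rebuilds
    // them as absolute URLs on the snapshot's scheme and host.
    func robotsURLs(snapshotURL, content string) (URLs []string) {
    	parsed, err := url.Parse(snapshotURL)
    	if err != nil {
    		return
    	}

    	for _, entry := range disallowRegex.FindAllString(content, -1) {
    		p := strings.TrimSpace(strings.TrimPrefix(entry, "Disallow:"))
    		p = strings.ReplaceAll(p, "*", "") // drop wildcards, e.g. /*/tmp -> //tmp
    		p = strings.Trim(p, "/")           // drop leading/trailing slashes

    		if p == "" {
    			continue
    		}

    		URLs = append(URLs, parsed.Scheme+"://"+path.Join(parsed.Host, p))
    	}

    	return
    }

    func main() {
    	// Prints [https://example.com/admin https://example.com/tmp]
    	fmt.Println(robotsURLs(
    		"https://example.com/robots.txt",
    		"User-agent: *\nDisallow: /admin/\nDisallow: /*/tmp",
    	))
    }
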
- URLs <- sourceURL + sourceURLs <- sourceURL } } }(row) diff --git a/pkg/xurlfind3r/xurlfind3r.go b/pkg/xurlfind3r/xurlfind3r.go index ed64e00..3010a42 100644 --- a/pkg/xurlfind3r/xurlfind3r.go +++ b/pkg/xurlfind3r/xurlfind3r.go @@ -1,6 +1,7 @@ package xurlfind3r import ( + "fmt" "regexp" "sync" @@ -42,8 +43,9 @@ func New(options *Options) (finder *Finder, err error) { Keys: options.Keys, ParseWaybackRobots: options.ParseWaybackRobots, ParseWaybackSource: options.ParseWaybackSource, - URLsRegex: regexp.MustCompile(`(?:"|')(((?:[a-zA-Z]{1,10}://|//)[^"'/]{1,}\.[a-zA-Z]{2,}[^"']{0,})|((?:/|\.\./|\./)[^"'><,;| *()(%%$^/\\\[\]][^"'><,;|()]{1,})|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{1,}\.(?:[a-zA-Z]{1,4}|action)(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{3,}(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-]{1,}\.(?:php|asp|aspx|jsp|json|action|html|js|txt|xml)(?:[\?|#][^"|']{0,}|)))(?:"|')`), //nolint:gocritic // Works so far - MediaURLsRegex: regexp.MustCompile(`(?i)\.(apng|bpm|png|bmp|gif|heif|ico|cur|jpg|jpeg|jfif|pjp|pjpeg|psd|raw|svg|tif|tiff|webp|xbm|3gp|aac|flac|mpg|mpeg|mp3|mp4|m4a|m4v|m4p|oga|ogg|ogv|mov|wav|webm|eot|woff|woff2|ttf|otf|css)(?:\?|#|$)`), + LinkFinderRegex: regexp.MustCompile(`(?:"|')(((?:[a-zA-Z]{1,10}://|//)[^"'/]{1,}\.[a-zA-Z]{2,}[^"']{0,})|((?:/|\.\./|\./)[^"'><,;| *()(%%$^/\\\[\]][^"'><,;|()]{1,})|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{1,}\.(?:[a-zA-Z]{1,4}|action)(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{3,}(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-]{1,}\.(?:php|asp|aspx|jsp|json|action|html|js|txt|xml)(?:[\?|#][^"|']{0,}|)))(?:"|')`), //nolint:gocritic // Works so far + URLsRegex: regexp.MustCompile(fmt.Sprintf(`https?://(?:[\w.-]+\.)?%s(?:/[\w.-]*)*(?:\?[^\s#]*)?(?:#[^\s]*)?`, regexp.QuoteMeta(options.Domain))), + MediaURLsRegex: regexp.MustCompile(`(?i)\.(apng|bpm|png|bmp|gif|heif|ico|cur|jpg|jpeg|jfif|pjp|pjpeg|psd|raw|svg|tif|tiff|webp|xbm|3gp|aac|flac|mpg|mpeg|mp3|mp4|m4a|m4v|m4p|oga|ogg|ogv|mov|wav|webm|eot|woff|woff2|ttf|otf)(?:\?|#|$)`), RobotsURLsRegex: regexp.MustCompile(`^(https?)://[^ "]+/robots.txt$`), }, } From df1f17791c1f46e1d03d5bee350312f963084f7b Mon Sep 17 00:00:00 2001 From: "Alex Munene (@enenumxela)" <62714471+enenumxela@users.noreply.github.com> Date: Mon, 24 Jul 2023 09:07:08 +0300 Subject: [PATCH 07/24] chore: - --- .github/workflows/release.yml | 4 +++- .goreleaser.yaml | 2 -- cmd/xurlfind3r/main.go | 36 ++++++++++++++++++----------------- 3 files changed, 22 insertions(+), 20 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 21da946..0e20215 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -26,8 +26,10 @@ jobs: name: Run GoReleaser uses: goreleaser/goreleaser-action@v4 with: - args: "release --clean" + distribution: goreleaser version: latest + args: "release --clean" + workdir: . 
env: GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}" SLACK_WEBHOOK: "${{ secrets.SLACK_WEBHOOK }}" diff --git a/.goreleaser.yaml b/.goreleaser.yaml index 37cb102..fa08bdf 100644 --- a/.goreleaser.yaml +++ b/.goreleaser.yaml @@ -39,8 +39,6 @@ archives: id: tgz builds: [xurlfind3r-cli] format: tar.gz - replacements: - darwin: macOS format_overrides: - goos: windows diff --git a/cmd/xurlfind3r/main.go b/cmd/xurlfind3r/main.go index 3ca7621..165148a 100644 --- a/cmd/xurlfind3r/main.go +++ b/cmd/xurlfind3r/main.go @@ -43,7 +43,7 @@ func init() { // Handle CLI arguments, flags & help message (pflag) pflag.StringVarP(&domain, "domain", "d", "", "") pflag.BoolVar(&includeSubdomains, "include-subdomains", false, "") - pflag.BoolVarP(&listSources, "sources", "s", false, "") + pflag.BoolVar(&listSources, "sources", false, "") pflag.StringSliceVarP(&sourcesToUse, "use-sources", "u", []string{}, "") pflag.StringSliceVarP(&sourcesToExclude, "exclude-sources", "e", []string{}, "") pflag.BoolVar(&parseWaybackRobots, "parse-wayback-robots", false, "") @@ -62,28 +62,30 @@ func init() { h := "USAGE:\n" h += " xurlfind3r [OPTIONS]\n" - h += "\nTARGET:\n" - h += " -d, --domain string domain to match URLs\n" - h += " --include-subdomains bool match subdomain's URLs\n" + h += "\nINPUT:\n" + h += " -d, --domain string domain to match URLs\n" + + h += "\nSCOPE:\n" + h += " --include-subdomains bool match subdomain's URLs\n" h += "\nSOURCES:\n" - h += " -s, --sources bool list supported sources\n" - h += " -u, --use-sources strings comma(,) separated sources to use\n" - h += " -e, --exclude-sources strings comma(,) separated sources to exclude\n" - h += " --parse-wayback-robots bool with wayback, parse robots.txt snapshots\n" - h += " --parse-wayback-source bool with wayback, parse source code snapshots\n" + h += " --sources bool list supported sources\n" + h += " -u, --use-sources string[] comma(,) separated sources to use\n" + h += " -e, --exclude-sources string[] comma(,) separated sources to exclude\n" + h += " --parse-wayback-robots bool with wayback, parse robots.txt snapshots\n" + h += " --parse-wayback-source bool with wayback, parse source code snapshots\n" h += "\nFILTER & MATCH:\n" - h += " -f, --filter string regex to filter URLs\n" - h += " -m, --match string regex to match URLs\n" + h += " -f, --filter string regex to filter URLs\n" + h += " -m, --match string regex to match URLs\n" h += "\nOUTPUT:\n" - h += " --no-color bool no color mode\n" - h += " -o, --output string output URLs file path\n" - h += fmt.Sprintf(" -v, --verbosity string debug, info, warning, error, fatal or silent (default: %s)\n", string(levels.LevelInfo)) + h += " --no-color bool disable colored output\n" + h += " -o, --output string output URLs file path\n" + h += fmt.Sprintf(" -v, --verbosity string debug, info, warning, error, fatal or silent (default: %s)\n", string(levels.LevelInfo)) h += "\nCONFIGURATION:\n" - h += fmt.Sprintf(" -c, --configuration string configuration file path (default: %s)\n", defaultYAMLConfigFile) + h += fmt.Sprintf(" -c, --configuration string configuration file path (default: %s)\n", defaultYAMLConfigFile) fmt.Fprintln(os.Stderr, h) } @@ -127,8 +129,8 @@ func main() { // List suported sources if listSources { - hqgolog.Info().Msgf("listing %v current supported sources", au.Underline(strconv.Itoa(len(config.Sources))).Bold()) - hqgolog.Info().Msgf("sources with %v needs a key or token", au.Underline("*").Bold()) + hqgolog.Info().Msgf("listing, %v, current supported sources.", 
au.Underline(strconv.Itoa(len(config.Sources))).Bold()) + hqgolog.Info().Msgf("sources marked with %v need key(s) or token(s) to work.", au.Underline("*").Bold()) hqgolog.Print().Msg("") needsKey := make(map[string]interface{}) From 3d939481249c6df9eb168f164a7a072993e0f7a1 Mon Sep 17 00:00:00 2001 From: "Alex Munene (@enenumxela)" <62714471+enenumxela@users.noreply.github.com> Date: Wed, 26 Jul 2023 13:49:48 +0300 Subject: [PATCH 08/24] *: A bulk commit (not recommended) feat: Handle bulk domains refactor: Github source --- cmd/xurlfind3r/main.go | 228 +++++++++++++----- go.mod | 11 +- go.sum | 27 ++- pkg/xurlfind3r/httpclient/client.go | 8 +- pkg/xurlfind3r/sources/bevigil/bevigil.go | 21 +- .../sources/commoncrawl/commoncrawl.go | 36 ++- pkg/xurlfind3r/sources/configuration.go | 9 - pkg/xurlfind3r/sources/github/github.go | 206 +++++----------- pkg/xurlfind3r/sources/github/utils.go | 120 +++++++++ pkg/xurlfind3r/sources/intelx/intelx.go | 54 ++--- pkg/xurlfind3r/sources/otx/otx.go | 25 +- pkg/xurlfind3r/sources/source.go | 2 +- pkg/xurlfind3r/sources/urlscan/urlscan.go | 35 +-- pkg/xurlfind3r/sources/utils.go | 2 - pkg/xurlfind3r/sources/wayback/wayback.go | 24 +- .../sources/wayback/waybacksource.go | 37 ++- pkg/xurlfind3r/xurlfind3r.go | 13 +- 17 files changed, 483 insertions(+), 375 deletions(-) create mode 100644 pkg/xurlfind3r/sources/github/utils.go diff --git a/cmd/xurlfind3r/main.go b/cmd/xurlfind3r/main.go index 165148a..013a1c0 100644 --- a/cmd/xurlfind3r/main.go +++ b/cmd/xurlfind3r/main.go @@ -8,12 +8,14 @@ import ( "reflect" "strconv" "strings" + "sync" "github.com/hueristiq/hqgolog" "github.com/hueristiq/hqgolog/formatter" "github.com/hueristiq/hqgolog/levels" "github.com/hueristiq/xurlfind3r/internal/configuration" "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r" + "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/sources" "github.com/logrusorgru/aurora/v3" "github.com/spf13/pflag" ) @@ -21,37 +23,44 @@ import ( var ( au aurora.Aurora - domain string - includeSubdomains bool - listSources bool - sourcesToUse []string - sourcesToExclude []string - parseWaybackRobots bool - parseWaybackSource bool - filterPattern string - matchPattern string - monochrome bool - output string - verbosity string - YAMLConfigFile string + domainsSlice []string + domainsListFilePath string + includeSubdomains bool + listSources bool + sourcesToUse []string + sourcesToExclude []string + parseWaybackRobots bool + parseWaybackSource bool + threads int + filterPattern string + matchPattern string + monochrome bool + output string + outputDirectory string + verbosity string + YAMLConfigFile string ) func init() { // defaults + defaultThreads := 50 defaultYAMLConfigFile := fmt.Sprintf("~/.hueristiq/%s/config.yaml", configuration.NAME) // Handle CLI arguments, flags & help message (pflag) - pflag.StringVarP(&domain, "domain", "d", "", "") + pflag.StringSliceVarP(&domainsSlice, "domain", "d", []string{}, "") + pflag.StringVarP(&domainsListFilePath, "list", "l", "", "") pflag.BoolVar(&includeSubdomains, "include-subdomains", false, "") pflag.BoolVar(&listSources, "sources", false, "") pflag.StringSliceVarP(&sourcesToUse, "use-sources", "u", []string{}, "") pflag.StringSliceVarP(&sourcesToExclude, "exclude-sources", "e", []string{}, "") pflag.BoolVar(&parseWaybackRobots, "parse-wayback-robots", false, "") pflag.BoolVar(&parseWaybackSource, "parse-wayback-source", false, "") + pflag.IntVarP(&threads, "threads", "t", defaultThreads, "") pflag.StringVarP(&filterPattern, "filter", "f", "", "") 
pflag.StringVarP(&matchPattern, "match", "m", "", "") pflag.BoolVar(&monochrome, "no-color", false, "") pflag.StringVarP(&output, "output", "o", "", "") + pflag.StringVarP(&outputDirectory, "outputDirectory", "O", "", "") pflag.StringVarP(&verbosity, "verbosity", "v", string(levels.LevelInfo), "") pflag.StringVarP(&YAMLConfigFile, "configuration", "c", defaultYAMLConfigFile, "") @@ -63,7 +72,8 @@ func init() { h += " xurlfind3r [OPTIONS]\n" h += "\nINPUT:\n" - h += " -d, --domain string domain to match URLs\n" + h += " -d, --domain string[] target domains\n" + h += " -l, --list string target domains' list file path\n" h += "\nSCOPE:\n" h += " --include-subdomains bool match subdomain's URLs\n" @@ -75,6 +85,9 @@ func init() { h += " --parse-wayback-robots bool with wayback, parse robots.txt snapshots\n" h += " --parse-wayback-source bool with wayback, parse source code snapshots\n" + h += "\nOPTIMIZATION:\n" + h += fmt.Sprintf(" -t, --threads int number of threads (default: %d)\n", defaultThreads) + h += "\nFILTER & MATCH:\n" h += " -f, --filter string regex to filter URLs\n" h += " -m, --match string regex to match URLs\n" @@ -82,6 +95,7 @@ func init() { h += "\nOUTPUT:\n" h += " --no-color bool disable colored output\n" h += " -o, --output string output URLs file path\n" + h += " -O, --output-directory string output URLs directory path\n" h += fmt.Sprintf(" -v, --verbosity string debug, info, warning, error, fatal or silent (default: %s)\n", string(levels.LevelInfo)) h += "\nCONFIGURATION:\n" @@ -153,77 +167,163 @@ func main() { os.Exit(0) } - // Find URLs - if verbosity != string(levels.LevelSilent) { - hqgolog.Info().Msgf("finding URLs for %v.", au.Underline(domain).Bold()) + domains := make(chan string, threads) - if includeSubdomains { - hqgolog.Info().Msg("`--include-subdomains` used: match subdomain's URLs.") + // Load input domains + go func() { + defer close(domains) + + // input domains: slice + for _, domain := range domainsSlice { + domains <- domain } - hqgolog.Print().Msg("") - } + // input domains: file + if domainsListFilePath != "" { + file, err := os.Open(domainsListFilePath) + if err != nil { + hqgolog.Error().Msg(err.Error()) + } - options := &xurlfind3r.Options{ - Domain: domain, - IncludeSubdomains: includeSubdomains, - SourcesToUSe: sourcesToUse, - SourcesToExclude: sourcesToExclude, - Keys: config.Keys, - ParseWaybackRobots: parseWaybackRobots, - ParseWaybackSource: parseWaybackSource, - FilterPattern: filterPattern, - Matchattern: matchPattern, - } + scanner := bufio.NewScanner(file) - finder, err := xurlfind3r.New(options) - if err != nil { - hqgolog.Fatal().Msg(err.Error()) - } + for scanner.Scan() { + domain := scanner.Text() - URLs := finder.Find() + if domain != "" { + domains <- domain + } + } - if output != "" { - // Create output file path directory - directory := filepath.Dir(output) + if err := scanner.Err(); err != nil { + hqgolog.Error().Msg(err.Error()) + } + } - if _, err := os.Stat(directory); os.IsNotExist(err) { - if err = os.MkdirAll(directory, os.ModePerm); err != nil { - hqgolog.Fatal().Msg(err.Error()) + // input domains: stdin + if hasStdin() { + scanner := bufio.NewScanner(os.Stdin) + + for scanner.Scan() { + domain := scanner.Text() + + if domain != "" { + domains <- domain + } + } + + if err := scanner.Err(); err != nil { + hqgolog.Error().Msg(err.Error()) } } + }() + + // Find and output URLs. 
+ var consolidatedWriter *bufio.Writer - // Create output file - file, err := os.OpenFile(output, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + if output != "" { + directory := filepath.Dir(output) + + mkdir(directory) + + consolidatedFile, err := os.OpenFile(output, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) if err != nil { hqgolog.Fatal().Msg(err.Error()) } - defer file.Close() + defer consolidatedFile.Close() - // Write URLs output file and print on screen - writer := bufio.NewWriter(file) + consolidatedWriter = bufio.NewWriter(consolidatedFile) + } - for URL := range URLs { - if verbosity == string(levels.LevelSilent) { - hqgolog.Print().Msg(URL.Value) - } else { - hqgolog.Print().Msgf("[%s] %s", au.BrightBlue(URL.Source), URL.Value) + if outputDirectory != "" { + mkdir(outputDirectory) + } + + wg := &sync.WaitGroup{} + + for i := 0; i < threads; i++ { + wg.Add(1) + + go func() { + defer wg.Done() + + options := &xurlfind3r.Options{ + IncludeSubdomains: includeSubdomains, + SourcesToUSe: sourcesToUse, + SourcesToExclude: sourcesToExclude, + Keys: config.Keys, + ParseWaybackRobots: parseWaybackRobots, + ParseWaybackSource: parseWaybackSource, + FilterPattern: filterPattern, + Matchattern: matchPattern, } - fmt.Fprintln(writer, URL.Value) - } + finder, err := xurlfind3r.New(options) + if err != nil { + hqgolog.Error().Msg(err.Error()) + + return + } + + for domain := range domains { + URLs := finder.Find(domain) + + switch { + case output != "": + processURLs(consolidatedWriter, URLs, verbosity) + case outputDirectory != "": + domainFile, err := os.OpenFile(filepath.Join(outputDirectory, domain+".txt"), os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + hqgolog.Fatal().Msg(err.Error()) + } - if err = writer.Flush(); err != nil { + domainWriter := bufio.NewWriter(domainFile) + + processURLs(domainWriter, URLs, verbosity) + default: + processURLs(nil, URLs, verbosity) + } + } + }() + } + + wg.Wait() +} + +func hasStdin() bool { + stat, err := os.Stdin.Stat() + if err != nil { + return false + } + + isPipedFromChrDev := (stat.Mode() & os.ModeCharDevice) == 0 + isPipedFromFIFO := (stat.Mode() & os.ModeNamedPipe) != 0 + + return isPipedFromChrDev || isPipedFromFIFO +} + +func mkdir(path string) { + if _, err := os.Stat(path); os.IsNotExist(err) { + if err = os.MkdirAll(path, os.ModePerm); err != nil { hqgolog.Fatal().Msg(err.Error()) } - } else { - // Print URLs on screen - for URL := range URLs { - if verbosity == string(levels.LevelSilent) { - hqgolog.Print().Msg(URL.Value) - } else { - hqgolog.Print().Msgf("[%s] %s", au.BrightBlue(URL.Source), URL.Value) + } +} + +func processURLs(writer *bufio.Writer, URLs chan sources.URL, verbosity string) { + for URL := range URLs { + if verbosity == string(levels.LevelSilent) { + hqgolog.Print().Msg(URL.Value) + } else { + hqgolog.Print().Msgf("[%s] %s", au.BrightBlue(URL.Source), URL.Value) + } + + if writer != nil { + fmt.Fprintln(writer, URL.Value) + + if err := writer.Flush(); err != nil { + hqgolog.Fatal().Msg(err.Error()) } } } diff --git a/go.mod b/go.mod index 295c35b..beacac2 100644 --- a/go.mod +++ b/go.mod @@ -5,9 +5,10 @@ go 1.20 require ( dario.cat/mergo v1.0.0 github.com/hueristiq/hqgolimit v0.0.0-20230623113203-3e14552a97f8 - github.com/hueristiq/hqgolog v0.0.0-20230623101640-92de7a10a4bb - github.com/hueristiq/hqgourl v0.0.0-20230623095947-4dee5ebb9a96 + github.com/hueristiq/hqgolog v0.0.0-20230623113334-a6018965a34f + github.com/hueristiq/hqgourl v0.0.0-20230724201234-90b0b363ac90 github.com/logrusorgru/aurora/v3 v3.0.0 + 
github.com/spf13/cast v1.5.1 github.com/spf13/pflag v1.0.5 github.com/tomnomnom/linkheader v0.0.0-20180905144013-02ca5825eb80 github.com/valyala/fasthttp v1.48.0 @@ -18,7 +19,7 @@ require ( github.com/andybalholm/brotli v1.0.5 // indirect github.com/klauspost/compress v1.16.3 // indirect github.com/valyala/bytebufferpool v1.0.0 // indirect - golang.org/x/net v0.11.0 // indirect - golang.org/x/sys v0.9.0 // indirect - golang.org/x/term v0.9.0 // indirect + golang.org/x/net v0.12.0 // indirect + golang.org/x/sys v0.10.0 // indirect + golang.org/x/term v0.10.0 // indirect ) diff --git a/go.sum b/go.sum index 1510b43..b3b4676 100644 --- a/go.sum +++ b/go.sum @@ -2,16 +2,23 @@ dario.cat/mergo v1.0.0 h1:AGCNq9Evsj31mOgNPcLyXc+4PNABt905YmuqPYYpBWk= dario.cat/mergo v1.0.0/go.mod h1:uNxQE+84aUszobStD9th8a29P2fMDhsBdgRYvZOxGmk= github.com/andybalholm/brotli v1.0.5 h1:8uQZIdzKmjc/iuPu7O2ioW48L81FgatrcpfFmiq/cCs= github.com/andybalholm/brotli v1.0.5/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig= +github.com/frankban/quicktest v1.14.4 h1:g2rn0vABPOOXmZUj+vbmUp0lPoXEMuhTpIluN0XL9UY= +github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= github.com/hueristiq/hqgolimit v0.0.0-20230623113203-3e14552a97f8 h1:r4ze6pX8H//X4SJEIcn8wHPgAhaGKEaa44lyHh1epXY= github.com/hueristiq/hqgolimit v0.0.0-20230623113203-3e14552a97f8/go.mod h1:CzhJzxz2rv/NMKNz5b4eKFh1epdcED05YTHT32NFyrI= -github.com/hueristiq/hqgolog v0.0.0-20230623101640-92de7a10a4bb h1:DQUVIiWnrTDQ4MP6UJw7/fMkySN+PYonDhlgBh31DDI= -github.com/hueristiq/hqgolog v0.0.0-20230623101640-92de7a10a4bb/go.mod h1:S5J3E3Azva5+JKv67uc+Hh3XwLDvkVYDGjEaMTFrIqg= -github.com/hueristiq/hqgourl v0.0.0-20230623095947-4dee5ebb9a96 h1:oQsID2S7L6dhNVbwkStxesXOMbn7LWfDSyohVbuKJe8= -github.com/hueristiq/hqgourl v0.0.0-20230623095947-4dee5ebb9a96/go.mod h1:8NAT2ECb69qzGf2d/ty0PVE3M3HK/+fXLtri2c47wQE= +github.com/hueristiq/hqgolog v0.0.0-20230623113334-a6018965a34f h1:JAgZOIJ+UbkENpRiOTlfg51CW0UNrUkgwLjUGiH+x9g= +github.com/hueristiq/hqgolog v0.0.0-20230623113334-a6018965a34f/go.mod h1:S5J3E3Azva5+JKv67uc+Hh3XwLDvkVYDGjEaMTFrIqg= +github.com/hueristiq/hqgourl v0.0.0-20230724201234-90b0b363ac90 h1:Du3nvvMK/KJLiCqY5batXILrljJs/Up8bVNT8QT/3PA= +github.com/hueristiq/hqgourl v0.0.0-20230724201234-90b0b363ac90/go.mod h1:V+4GiyE0z+oPokCZdV/4oDXPM+ofYQH/Mh0nZDhonfQ= github.com/klauspost/compress v1.16.3 h1:XuJt9zzcnaz6a16/OU53ZjWp/v7/42WcR5t2a0PcNQY= github.com/klauspost/compress v1.16.3/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/logrusorgru/aurora/v3 v3.0.0 h1:R6zcoZZbvVcGMvDCKo45A9U/lzYyzl5NfYIvznmDfE4= github.com/logrusorgru/aurora/v3 v3.0.0/go.mod h1:vsR12bk5grlLvLXAYrBsb5Oc/N+LxAlxggSjiwMnCUc= +github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8= +github.com/spf13/cast v1.5.1 h1:R+kOtfhWQE6TVQzY+4D7wJLBgkdVasCEFxSUBYBYIlA= +github.com/spf13/cast v1.5.1/go.mod h1:b9PdjNptOpzXr7Rq1q9gJML/2cdGQAo69NKzQ10KN48= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/tomnomnom/linkheader v0.0.0-20180905144013-02ca5825eb80 h1:nrZ3ySNYwJbSpD6ce9duiP+QkD3JuLCcWkdaehUS/3Y= @@ -20,12 +27,12 @@ github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6Kllzaw github.com/valyala/bytebufferpool v1.0.0/go.mod 
h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc= github.com/valyala/fasthttp v1.48.0 h1:oJWvHb9BIZToTQS3MuQ2R3bJZiNSa2KiNdeI8A+79Tc= github.com/valyala/fasthttp v1.48.0/go.mod h1:k2zXd82h/7UZc3VOdJ2WaUqt1uZ/XpXAfE9i+HBC3lA= -golang.org/x/net v0.11.0 h1:Gi2tvZIJyBtO9SDr1q9h5hEQCp/4L2RQ+ar0qjx2oNU= -golang.org/x/net v0.11.0/go.mod h1:2L/ixqYpgIVXmeoSA/4Lu7BzTG4KIyPIryS4IsOd1oQ= -golang.org/x/sys v0.9.0 h1:KS/R3tvhPqvJvwcKfnBHJwwthS11LRhmM5D59eEXa0s= -golang.org/x/sys v0.9.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/term v0.9.0 h1:GRRCnKYhdQrD8kfRAdQ6Zcw1P0OcELxGLKJvtjVMZ28= -golang.org/x/term v0.9.0/go.mod h1:M6DEAAIenWoTxdKrOltXcmDY3rSplQUkrvaDU5FcQyo= +golang.org/x/net v0.12.0 h1:cfawfvKITfUsFCeJIHJrbSxpeu/E81khclypR0GVT50= +golang.org/x/net v0.12.0/go.mod h1:zEVYFnQC7m/vmpQFELhcD1EWkZlX69l4oqgmer6hfKA= +golang.org/x/sys v0.10.0 h1:SqMFp9UcQJZa+pmYuAKjd9xq1f0j5rLcDIk0mj4qAsA= +golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/term v0.10.0 h1:3R7pNqamzBraeqj/Tj8qt1aQ2HpmlC+Cx/qL/7hn4/c= +golang.org/x/term v0.10.0/go.mod h1:lpqdcUyK/oCiQxvxVrppt5ggO2KCZ5QblwqPnfZ6d5o= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= diff --git a/pkg/xurlfind3r/httpclient/client.go b/pkg/xurlfind3r/httpclient/client.go index d648501..fc14393 100644 --- a/pkg/xurlfind3r/httpclient/client.go +++ b/pkg/xurlfind3r/httpclient/client.go @@ -1,8 +1,6 @@ package httpclient import ( - "fmt" - "github.com/valyala/fasthttp" ) @@ -17,9 +15,9 @@ func httpRequestWrapper(req *fasthttp.Request) (res *fasthttp.Response, err erro return } - if res.StatusCode() != fasthttp.StatusOK { - err = fmt.Errorf("unexpected status code") - } + // if res.StatusCode() != fasthttp.StatusOK { + // err = fmt.Errorf("unexpected status code") + // } return } diff --git a/pkg/xurlfind3r/sources/bevigil/bevigil.go b/pkg/xurlfind3r/sources/bevigil/bevigil.go index 363ff6c..13970ad 100644 --- a/pkg/xurlfind3r/sources/bevigil/bevigil.go +++ b/pkg/xurlfind3r/sources/bevigil/bevigil.go @@ -9,24 +9,21 @@ import ( "github.com/valyala/fasthttp" ) -type Response struct { +type response struct { Domain string `json:"domain"` URLs []string `json:"urls"` } type Source struct{} -func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sources.URL) { +func (source *Source) Run(config *sources.Configuration, domain string) (URLsChannel chan sources.URL) { URLsChannel = make(chan sources.URL) go func() { defer close(URLsChannel) - var ( - err error - ) - var key string + var err error key, err = sources.PickRandom(config.Keys.Bevigil) if key == "" || err != nil { @@ -39,7 +36,7 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc reqHeaders["X-Access-Token"] = key } - reqURL := fmt.Sprintf("https://osint.bevigil.com/api/%s/urls/", config.Domain) + reqURL := fmt.Sprintf("https://osint.bevigil.com/api/%s/urls/", domain) var res *fasthttp.Response @@ -48,16 +45,14 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc return } - var data Response + var responseData response - if err = json.Unmarshal(res.Body(), &data); err != nil { + if err = json.Unmarshal(res.Body(), &responseData); err != nil { return } - for index := range data.URLs { - URL := data.URLs[index] - - if 
!sources.IsInScope(URL, config.Domain, config.IncludeSubdomains) { + for _, URL := range responseData.URLs { + if !sources.IsInScope(URL, domain, config.IncludeSubdomains) { return } diff --git a/pkg/xurlfind3r/sources/commoncrawl/commoncrawl.go b/pkg/xurlfind3r/sources/commoncrawl/commoncrawl.go index 49c18e5..0ea6158 100644 --- a/pkg/xurlfind3r/sources/commoncrawl/commoncrawl.go +++ b/pkg/xurlfind3r/sources/commoncrawl/commoncrawl.go @@ -13,60 +13,54 @@ import ( "github.com/valyala/fasthttp" ) -type API struct { +type indexesResponse []struct { ID string `json:"id"` API string `json:"cdx-API"` } -type Response struct { +type response struct { URL string `json:"url"` Error string `json:"error"` } type Source struct{} -func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sources.URL) { +func (source *Source) Run(config *sources.Configuration, domain string) (URLsChannel chan sources.URL) { URLsChannel = make(chan sources.URL) go func() { defer close(URLsChannel) - var ( - err error - ) - var indexesRes *fasthttp.Response + var err error indexesRes, err = httpclient.SimpleGet("https://index.commoncrawl.org/collinfo.json") if err != nil { return } - var APIs []API + var indexesResponseData indexesResponse - if err = json.Unmarshal(indexesRes.Body(), &APIs); err != nil { + if err = json.Unmarshal(indexesRes.Body(), &indexesResponseData); err != nil { return } wg := new(sync.WaitGroup) - for index := range APIs { + for _, indexData := range indexesResponseData { wg.Add(1) - API := APIs[index] - go func(API string) { defer wg.Done() - var ( - err error - ) - - contentReqHeaders := map[string]string{"Host": "index.commoncrawl.org"} + contentReqHeaders := map[string]string{ + "Host": "index.commoncrawl.org", + } var contentRes *fasthttp.Response + var err error - contentRes, err = httpclient.Get(fmt.Sprintf("%s?url=*.%s/*&output=json&fl=url", API, config.Domain), "", contentReqHeaders) + contentRes, err = httpclient.Get(fmt.Sprintf("%s?url=*.%s/*&output=json&fl=url", API, domain), "", contentReqHeaders) if err != nil { return } @@ -74,7 +68,7 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc scanner := bufio.NewScanner(bytes.NewReader(contentRes.Body())) for scanner.Scan() { - var data Response + var data response if err = json.Unmarshal(scanner.Bytes(), &data); err != nil { return @@ -86,7 +80,7 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc URL := data.URL - if !sources.IsInScope(URL, config.Domain, config.IncludeSubdomains) { + if !sources.IsInScope(URL, domain, config.IncludeSubdomains) { return } @@ -96,7 +90,7 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc if scanner.Err() != nil { return } - }(API.API) + }(indexData.API) } wg.Wait() diff --git a/pkg/xurlfind3r/sources/configuration.go b/pkg/xurlfind3r/sources/configuration.go index d49b8eb..8c14492 100644 --- a/pkg/xurlfind3r/sources/configuration.go +++ b/pkg/xurlfind3r/sources/configuration.go @@ -1,19 +1,10 @@ package sources -import ( - "regexp" -) - type Configuration struct { - Domain string IncludeSubdomains bool Keys Keys ParseWaybackRobots bool ParseWaybackSource bool - LinkFinderRegex *regexp.Regexp - URLsRegex *regexp.Regexp - MediaURLsRegex *regexp.Regexp - RobotsURLsRegex *regexp.Regexp } type Keys struct { diff --git a/pkg/xurlfind3r/sources/github/github.go b/pkg/xurlfind3r/sources/github/github.go index 027d9c2..6d34504 100644 --- a/pkg/xurlfind3r/sources/github/github.go +++ 
b/pkg/xurlfind3r/sources/github/github.go @@ -8,34 +8,30 @@ import ( "fmt" "net/url" "regexp" - "strconv" - "strings" "time" + "github.com/hueristiq/hqgourl" "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/httpclient" "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/sources" + "github.com/spf13/cast" "github.com/tomnomnom/linkheader" "github.com/valyala/fasthttp" ) -type Response struct { - TotalCount int `json:"total_count"` - Items []Item `json:"items"` -} - -type Item struct { - Name string `json:"name"` - HTMLURL string `json:"html_url"` - TextMatches []TextMatch `json:"text_matches"` -} - -type TextMatch struct { - Fragment string `json:"fragment"` +type response struct { + TotalCount int `json:"total_count"` + Items []struct { + Name string `json:"name"` + HTMLURL string `json:"html_url"` + TextMatches []struct { + Fragment string `json:"fragment"` + } `json:"text_matches"` + } `json:"items"` } type Source struct{} -func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sources.URL) { +func (source *Source) Run(config *sources.Configuration, domain string) (URLsChannel chan sources.URL) { URLsChannel = make(chan sources.URL) go func() { @@ -47,15 +43,15 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc tokens := NewTokenManager(config.Keys.GitHub) - searchURL := fmt.Sprintf("https://api.github.com/search/code?per_page=100&q=%q&sort=created&order=asc", config.Domain) + searchURL := fmt.Sprintf("https://api.github.com/search/code?per_page=100&q=%q&sort=created&order=asc", domain) - source.Enumerate(searchURL, config.URLsRegex, tokens, URLsChannel, config) + source.Enumerate(searchURL, domain, tokens, URLsChannel, config) }() return URLsChannel } -func (source *Source) Enumerate(searchURL string, URLsRegex *regexp.Regexp, tokens *Tokens, URLsChannel chan sources.URL, config *sources.Configuration) { +func (source *Source) Enumerate(searchURL string, domain string, tokens *Tokens, URLsChannel chan sources.URL, config *sources.Configuration) { token := tokens.Get() if token.RetryAfter > 0 { @@ -71,29 +67,39 @@ func (source *Source) Enumerate(searchURL string, URLsRegex *regexp.Regexp, toke "Authorization": "token " + token.Hash, } - searchRes, err := httpclient.Request(fasthttp.MethodGet, searchURL, "", reqHeaders, nil) + var err error + var searchRes *fasthttp.Response + searchRes, err = httpclient.Request(fasthttp.MethodGet, searchURL, "", reqHeaders, nil) isForbidden := searchRes != nil && searchRes.StatusCode() == fasthttp.StatusForbidden - if err != nil && !isForbidden { return } - ratelimitRemaining, _ := strconv.ParseInt(string(searchRes.Header.Peek("X-Ratelimit-Remaining")), 10, 64) - + ratelimitRemaining := cast.ToInt64(searchRes.Header.Peek("X-Ratelimit-Remaining")) if isForbidden && ratelimitRemaining == 0 { - retryAfterSeconds, _ := strconv.ParseInt(string(searchRes.Header.Peek("Retry-After")), 10, 64) + retryAfterSeconds := cast.ToInt64(searchRes.Header.Peek("Retry-After")) + tokens.setCurrentTokenExceeded(retryAfterSeconds) - source.Enumerate(searchURL, URLsRegex, tokens, URLsChannel, config) + source.Enumerate(searchURL, domain, tokens, URLsChannel, config) } - var searchResData Response + var searchResData response if err = json.Unmarshal(searchRes.Body(), &searchResData); err != nil { return } + var mdExtractor *regexp.Regexp + + // (\w[a-zA-Z0-9][a-zA-Z0-9-\\.]*\.)? + // (?:.*\.)? 
+ mdExtractor, err = hqgourl.Extractor.ModerateMatchHost(`(\w[a-zA-Z0-9][a-zA-Z0-9-\\.]*\.)?` + regexp.QuoteMeta(domain)) + if err != nil { + return + } + // Process Items for index := range searchResData.Items { item := searchResData.Items[index] @@ -119,13 +125,19 @@ func (source *Source) Enumerate(searchURL string, URLsRegex *regexp.Regexp, toke continue } - URLs := URLsRegex.FindAllString(normalizeContent(line), -1) + URLs := mdExtractor.FindAllString(normalizeContent(line), -1) - for index := range URLs { - URL := URLs[index] + for _, URL := range URLs { URL = fixURL(URL) - if !sources.IsInScope(URL, config.Domain, config.IncludeSubdomains) { + parsedURL, err := hqgourl.Parse(URL) + if err != nil { + return + } + + URL = parsedURL.String() + + if !sources.IsInScope(URL, domain, config.IncludeSubdomains) { return } @@ -137,16 +149,20 @@ func (source *Source) Enumerate(searchURL string, URLsRegex *regexp.Regexp, toke return } - for index := range item.TextMatches { - textMatch := item.TextMatches[index] + for _, textMatch := range item.TextMatches { + URLs := mdExtractor.FindAllString(normalizeContent(textMatch.Fragment), -1) - URLs := URLsRegex.FindAllString(normalizeContent(textMatch.Fragment), -1) - - for index := range URLs { - URL := URLs[index] + for _, URL := range URLs { URL = fixURL(URL) - if !sources.IsInScope(URL, config.Domain, config.IncludeSubdomains) { + parsedURL, err := hqgourl.Parse(URL) + if err != nil { + return + } + + URL = parsedURL.String() + + if !sources.IsInScope(URL, domain, config.IncludeSubdomains) { return } @@ -164,123 +180,9 @@ func (source *Source) Enumerate(searchURL string, URLsRegex *regexp.Regexp, toke return } - source.Enumerate(nextURL, URLsRegex, tokens, URLsChannel, config) - } - } -} - -func getRawContentURL(URL string) (rawContentURL string) { - rawContentURL = URL - rawContentURL = strings.ReplaceAll(rawContentURL, "https://github.com/", "https://raw.githubusercontent.com/") - rawContentURL = strings.ReplaceAll(rawContentURL, "/blob/", "/") - - return -} - -func normalizeContent(content string) (normalizedContent string) { - normalizedContent = content - normalizedContent, _ = url.QueryUnescape(normalizedContent) - normalizedContent = strings.ReplaceAll(normalizedContent, "\\t", "") - normalizedContent = strings.ReplaceAll(normalizedContent, "\\n", "") - - return -} - -func fixURL(URL string) (fixedURL string) { - fixedURL = URL - - // ',",`, - quotes := []rune{'\'', '"', '`'} - - for i := range quotes { - quote := quotes[i] - - indexOfQuote := findUnbalancedQuote(URL, quote) - if indexOfQuote <= len(fixedURL) && indexOfQuote >= 0 { - fixedURL = fixedURL[:indexOfQuote] - } - } - - // (),[],{} - parentheses := []struct { - Opening, Closing rune - }{ - {'[', ']'}, - {'(', ')'}, - {'{', '}'}, - } - - for i := range parentheses { - parenthesis := parentheses[i] - - indexOfParenthesis := findUnbalancedBracket(URL, parenthesis.Opening, parenthesis.Closing) - if indexOfParenthesis <= len(fixedURL) && indexOfParenthesis >= 0 { - fixedURL = fixedURL[:indexOfParenthesis] - } - } - - // ; - indexOfComma := strings.Index(fixedURL, ";") - if indexOfComma <= len(fixedURL) && indexOfComma >= 0 { - fixedURL = fixedURL[:indexOfComma] - } - - return -} - -func findUnbalancedQuote(s string, quoteChar rune) int { - insideQuotes := false - - for _, ch := range s { - if ch == quoteChar { - if insideQuotes { - insideQuotes = false - } else { - insideQuotes = true - } - } - } - - // If still inside quotes at the end of the string, - // find the index of the opening 
quote - if insideQuotes { - for i, ch := range s { - if ch == quoteChar { - return i - } - } - } - - return -1 // return -1 if all quotes are balanced -} - -func findUnbalancedBracket(s string, openChar, closeChar rune) int { - openCount := 0 - - var firstOpenIndex int - - for i, ch := range s { - if ch == openChar { - if openCount == 0 { - firstOpenIndex = i - } - - openCount++ - } else if ch == closeChar { - openCount-- - - if openCount < 0 { - return i // Found an unbalanced closing bracket - } + source.Enumerate(nextURL, domain, tokens, URLsChannel, config) } } - - // If there are unmatched opening brackets - if openCount > 0 { - return firstOpenIndex - } - - return -1 // All brackets are balanced } func (source *Source) Name() string { diff --git a/pkg/xurlfind3r/sources/github/utils.go b/pkg/xurlfind3r/sources/github/utils.go new file mode 100644 index 0000000..9787b24 --- /dev/null +++ b/pkg/xurlfind3r/sources/github/utils.go @@ -0,0 +1,120 @@ +package github + +import ( + "net/url" + "strings" +) + +func getRawContentURL(URL string) (rawContentURL string) { + rawContentURL = URL + rawContentURL = strings.ReplaceAll(rawContentURL, "https://github.com/", "https://raw.githubusercontent.com/") + rawContentURL = strings.ReplaceAll(rawContentURL, "/blob/", "/") + + return +} + +func normalizeContent(content string) (normalizedContent string) { + normalizedContent = content + normalizedContent, _ = url.QueryUnescape(normalizedContent) + normalizedContent = strings.ReplaceAll(normalizedContent, "\\t", "") + normalizedContent = strings.ReplaceAll(normalizedContent, "\\n", "") + + return +} + +func fixURL(URL string) (fixedURL string) { + fixedURL = URL + + // ',",`, + quotes := []rune{'\'', '"', '`'} + + for i := range quotes { + quote := quotes[i] + + indexOfQuote := findUnbalancedQuote(URL, quote) + if indexOfQuote <= len(fixedURL) && indexOfQuote >= 0 { + fixedURL = fixedURL[:indexOfQuote] + } + } + + // (),[],{} + parentheses := []struct { + Opening, Closing rune + }{ + {'[', ']'}, + {'(', ')'}, + {'{', '}'}, + } + + for i := range parentheses { + parenthesis := parentheses[i] + + indexOfParenthesis := findUnbalancedBracket(URL, parenthesis.Opening, parenthesis.Closing) + if indexOfParenthesis <= len(fixedURL) && indexOfParenthesis >= 0 { + fixedURL = fixedURL[:indexOfParenthesis] + } + } + + // ; + indexOfComma := strings.Index(fixedURL, ";") + if indexOfComma <= len(fixedURL) && indexOfComma >= 0 { + fixedURL = fixedURL[:indexOfComma] + } + + return +} + +func findUnbalancedQuote(s string, quoteChar rune) int { + insideQuotes := false + + for _, ch := range s { + if ch == quoteChar { + if insideQuotes { + insideQuotes = false + } else { + insideQuotes = true + } + } + } + + // If still inside quotes at the end of the string, + // find the index of the opening quote + if insideQuotes { + for i, ch := range s { + if ch == quoteChar { + return i + } + } + } + + return -1 // return -1 if all quotes are balanced +} + +func findUnbalancedBracket(s string, openChar, closeChar rune) int { + openCount := 0 + + var firstOpenIndex int + + for i, ch := range s { + if ch == openChar { + if openCount == 0 { + firstOpenIndex = i + } + + openCount++ + } else if ch == closeChar { + openCount-- + + if openCount < 0 { + return i // Found an unbalanced closing bracket + } + } + } + + // If there are unmatched opening brackets + if openCount > 0 { + return firstOpenIndex + } + + return -1 // All brackets are balanced +} diff --git a/pkg/xurlfind3r/sources/intelx/intelx.go 
b/pkg/xurlfind3r/sources/intelx/intelx.go index 700dd14..29812ce 100644 --- a/pkg/xurlfind3r/sources/intelx/intelx.go +++ b/pkg/xurlfind3r/sources/intelx/intelx.go @@ -13,40 +13,34 @@ import ( "github.com/valyala/fasthttp" ) -type SearchResponse struct { - ID string `json:"id"` - Status int `json:"status"` -} - -type searchResultType struct { - Selectors []selectorType `json:"selectors"` - Status int `json:"status"` -} - -type selectorType struct { - Selectvalue string `json:"selectorvalue"` -} - -type requestBody struct { +type searchRequest struct { Term string `json:"term"` Timeout time.Duration `json:"timeout"` MaxResults int `json:"maxresults"` Media int `json:"media"` } +type searchResponse struct { + ID string `json:"id"` + Status int `json:"status"` +} + +type resultsResponse struct { + Selectors []struct { + Selectvalue string `json:"selectorvalue"` + } `json:"selectors"` + Status int `json:"status"` +} type Source struct{} -func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sources.URL) { +func (source *Source) Run(config *sources.Configuration, domain string) (URLsChannel chan sources.URL) { URLsChannel = make(chan sources.URL) go func() { defer close(URLsChannel) - var ( - err error - ) - var key string + var err error key, err = sources.PickRandom(config.Keys.Intelx) if key == "" || err != nil { @@ -66,8 +60,8 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc } searchURL := fmt.Sprintf("https://%s/phonebook/search?k=%s", intelXHost, intelXKey) - searchReqBody := requestBody{ - Term: config.Domain, + searchReqBody := searchRequest{ + Term: domain, MaxResults: 100000, Media: 0, Timeout: 20, @@ -87,13 +81,13 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc return } - var resData SearchResponse + var searchResponseData searchResponse - if err = json.Unmarshal(res.Body(), &resData); err != nil { + if err = json.Unmarshal(res.Body(), &searchResponseData); err != nil { return } - resultsURL := fmt.Sprintf("https://%s/phonebook/search/result?k=%s&id=%s&limit=10000", intelXHost, intelXKey, resData.ID) + resultsURL := fmt.Sprintf("https://%s/phonebook/search/result?k=%s&id=%s&limit=10000", intelXHost, intelXKey, searchResponseData.ID) status := 0 for status == 0 || status == 3 { @@ -102,22 +96,22 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc return } - var resData searchResultType + var resultsResponseData resultsResponse - if err = json.Unmarshal(res.Body(), &resData); err != nil { + if err = json.Unmarshal(res.Body(), &resultsResponseData); err != nil { return } - status = resData.Status + status = resultsResponseData.Status - for _, hostname := range resData.Selectors { + for _, hostname := range resultsResponseData.Selectors { URL := hostname.Selectvalue if isEmail(URL) { continue } - if !sources.IsInScope(URL, config.Domain, config.IncludeSubdomains) { + if !sources.IsInScope(URL, domain, config.IncludeSubdomains) { return } diff --git a/pkg/xurlfind3r/sources/otx/otx.go b/pkg/xurlfind3r/sources/otx/otx.go index 2a14e70..bde9baf 100644 --- a/pkg/xurlfind3r/sources/otx/otx.go +++ b/pkg/xurlfind3r/sources/otx/otx.go @@ -10,7 +10,7 @@ import ( "github.com/valyala/fasthttp" ) -type Response struct { +type response struct { URLList []struct { Domain string `json:"domain"` URL string `json:"url"` @@ -29,43 +29,38 @@ type Response struct { type Source struct{} -func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sources.URL) { +func 
(source *Source) Run(config *sources.Configuration, domain string) (URLsChannel chan sources.URL) { URLsChannel = make(chan sources.URL) go func() { defer close(URLsChannel) for page := 1; ; page++ { - var ( - err error - ) - - reqURL := fmt.Sprintf("https://otx.alienvault.com/api/v1/indicators/domain/%s/url_list?limit=%d&page=%d", config.Domain, 200, page) + reqURL := fmt.Sprintf("https://otx.alienvault.com/api/v1/indicators/domain/%s/url_list?limit=%d&page=%d", domain, 200, page) var res *fasthttp.Response + var err error res, err = httpclient.SimpleGet(reqURL) if err != nil { return } - var resData Response + var responseData response - if err = json.Unmarshal(res.Body(), &resData); err != nil { + if err = json.Unmarshal(res.Body(), &responseData); err != nil { return } - for index := range resData.URLList { - URL := resData.URLList[index].URL - - if !sources.IsInScope(URL, config.Domain, config.IncludeSubdomains) { + for _, URL := range responseData.URLList { + if !sources.IsInScope(URL.URL, domain, config.IncludeSubdomains) { return } - URLsChannel <- sources.URL{Source: source.Name(), Value: URL} + URLsChannel <- sources.URL{Source: source.Name(), Value: URL.URL} } - if !resData.HasNext { + if !responseData.HasNext { break } } diff --git a/pkg/xurlfind3r/sources/source.go b/pkg/xurlfind3r/sources/source.go index 9730199..f4d4340 100644 --- a/pkg/xurlfind3r/sources/source.go +++ b/pkg/xurlfind3r/sources/source.go @@ -1,6 +1,6 @@ package sources type Source interface { - Run(config *Configuration) (URLs chan URL) + Run(config *Configuration, domain string) (URLs chan URL) Name() string } diff --git a/pkg/xurlfind3r/sources/urlscan/urlscan.go b/pkg/xurlfind3r/sources/urlscan/urlscan.go index ad56bf1..3e352c0 100644 --- a/pkg/xurlfind3r/sources/urlscan/urlscan.go +++ b/pkg/xurlfind3r/sources/urlscan/urlscan.go @@ -11,7 +11,7 @@ import ( "github.com/valyala/fasthttp" ) -type Response struct { +type response struct { Results []struct { Page struct { Domain string `json:"domain"` @@ -29,14 +29,17 @@ type Response struct { type Source struct{} -func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sources.URL) { +func (source *Source) Run(config *sources.Configuration, domain string) (URLsChannel chan sources.URL) { URLsChannel = make(chan sources.URL) go func() { defer close(URLsChannel) - key, err := sources.PickRandom(config.Keys.URLScan) - if key == "" || err != nil { + var key string + var err error + + key, err = sources.PickRandom(config.Keys.URLScan) + if err != nil { return } @@ -44,7 +47,7 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc "Content-Type": "application/json", } - if len(config.Keys.URLScan) > 0 { + if key != "" { reqHeaders["API-Key"] = key } @@ -53,7 +56,7 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc for { baseURL := "https://urlscan.io/api/v1/search/" params := url.Values{} - params.Set("q", config.Domain) + params.Set("q", domain) if searchAfter != nil { searchAfterJSON, _ := json.Marshal(searchAfter) @@ -69,36 +72,36 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc return } - var resData Response + var responseData response - if err = json.Unmarshal(res.Body(), &resData); err != nil { + if err = json.Unmarshal(res.Body(), &responseData); err != nil { return } - if resData.Status == 429 { + if responseData.Status == 429 { break } - for index := range resData.Results { - URL := resData.Results[index].Page.URL + for _, result := range 
responseData.Results { + URL := result.Page.URL - if resData.Results[index].Page.Domain != config.Domain || - !strings.HasSuffix(resData.Results[index].Page.Domain, config.Domain) { + if result.Page.Domain != domain || + !strings.HasSuffix(result.Page.Domain, domain) { continue } - if !sources.IsInScope(URL, config.Domain, config.IncludeSubdomains) { + if !sources.IsInScope(URL, domain, config.IncludeSubdomains) { return } URLsChannel <- sources.URL{Source: source.Name(), Value: URL} } - if !resData.HasMore { + if !responseData.HasMore { break } - lastResult := resData.Results[len(resData.Results)-1] + lastResult := responseData.Results[len(responseData.Results)-1] searchAfter = lastResult.Sort } }() diff --git a/pkg/xurlfind3r/sources/utils.go b/pkg/xurlfind3r/sources/utils.go index 182b19f..1ded7c8 100644 --- a/pkg/xurlfind3r/sources/utils.go +++ b/pkg/xurlfind3r/sources/utils.go @@ -15,7 +15,6 @@ func PickRandom[T any](v []T) (picked T, err error) { return } - // Generate a cryptographically secure random index max := big.NewInt(int64(length)) var indexBig *big.Int @@ -29,7 +28,6 @@ func PickRandom[T any](v []T) (picked T, err error) { index := indexBig.Int64() - // Return the element at the random index picked = v[index] return diff --git a/pkg/xurlfind3r/sources/wayback/wayback.go b/pkg/xurlfind3r/sources/wayback/wayback.go index 09f3518..ccabc65 100644 --- a/pkg/xurlfind3r/sources/wayback/wayback.go +++ b/pkg/xurlfind3r/sources/wayback/wayback.go @@ -6,6 +6,7 @@ import ( "bytes" "encoding/json" "fmt" + "regexp" "strings" "sync" @@ -23,7 +24,7 @@ var ( }) ) -func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sources.URL) { +func (source *Source) Run(config *sources.Configuration, domain string) (URLsChannel chan sources.URL) { URLsChannel = make(chan sources.URL) go func() { @@ -41,10 +42,10 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc ) if config.IncludeSubdomains { - config.Domain = "*." + config.Domain + domain = "*." 
+ domain } - results, err = getWaybackURLs(config.Domain) + results, err = getWaybackURLs(domain) if err != nil { return } @@ -59,6 +60,9 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc } }() + mediaURLRegex := regexp.MustCompile(`(?i)\.(apng|bpm|png|bmp|gif|heif|ico|cur|jpg|jpeg|jfif|pjp|pjpeg|psd|raw|svg|tif|tiff|webp|xbm|3gp|aac|flac|mpg|mpeg|mp3|mp4|m4a|m4v|m4p|oga|ogg|ogv|mov|wav|webm|eot|woff|woff2|ttf|otf)(?:\?|#|$)`) + robotsURLsRegex := regexp.MustCompile(`^(https?)://[^ "]+/robots.txt$`) + // Process wayback Snapshots wg := &sync.WaitGroup{} @@ -68,7 +72,7 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc go func(URL string) { defer wg.Done() - if !sources.IsInScope(URL, config.Domain, config.IncludeSubdomains) { + if !sources.IsInScope(URL, domain, config.IncludeSubdomains) { return } @@ -78,23 +82,23 @@ func (source *Source) Run(config *sources.Configuration) (URLsChannel chan sourc return } - if config.MediaURLsRegex.MatchString(URL) { + if mediaURLRegex.MatchString(URL) { return } if config.ParseWaybackRobots && - config.RobotsURLsRegex.MatchString(URL) { + robotsURLsRegex.MatchString(URL) { for robotsURL := range parseWaybackRobots(config, URL) { - if !sources.IsInScope(URL, config.Domain, config.IncludeSubdomains) { + if !sources.IsInScope(URL, domain, config.IncludeSubdomains) { return } URLsChannel <- sources.URL{Source: source.Name() + ":robots", Value: robotsURL} } } else if config.ParseWaybackSource && - !config.RobotsURLsRegex.MatchString(URL) { - for sourceURL := range parseWaybackSource(config, URL) { - if !sources.IsInScope(URL, config.Domain, config.IncludeSubdomains) { + !robotsURLsRegex.MatchString(URL) { + for sourceURL := range parseWaybackSource(domain, URL) { + if !sources.IsInScope(URL, domain, config.IncludeSubdomains) { return } diff --git a/pkg/xurlfind3r/sources/wayback/waybacksource.go b/pkg/xurlfind3r/sources/wayback/waybacksource.go index c95e1b1..e288574 100644 --- a/pkg/xurlfind3r/sources/wayback/waybacksource.go +++ b/pkg/xurlfind3r/sources/wayback/waybacksource.go @@ -3,26 +3,36 @@ package wayback import ( "fmt" "mime" + "regexp" "strings" "sync" "github.com/hueristiq/hqgourl" - "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/sources" ) -func parseWaybackSource(config *sources.Configuration, URL string) (sourceURLs chan string) { +func parseWaybackSource(domain, URL string) (sourceURLs chan string) { sourceURLs = make(chan string) go func() { defer close(sourceURLs) - // retrieve snapshots - snapshots, err := getWaybackSnapshots(URL) + var err error + var snapshots [][2]string + + snapshots, err = getWaybackSnapshots(URL) + if err != nil { + return + } + + lxExtractor := hqgourl.Extractor.Relaxed() + + var mdExtractor *regexp.Regexp + + mdExtractor, err = hqgourl.Extractor.ModerateMatchHost(`(\w[a-zA-Z0-9][a-zA-Z0-9-\\.]*\.)?` + regexp.QuoteMeta(domain)) if err != nil { return } - // retrieve and parse snapshots' content for robotsURLs wg := &sync.WaitGroup{} for index := range snapshots { @@ -38,10 +48,11 @@ func parseWaybackSource(config *sources.Configuration, URL string) (sourceURLs c return } - links := config.LinkFinderRegex.FindAllString(content, -1) + links := lxExtractor.FindAllString(content, -1) for index := range links { sourceURL := links[index] + // remove beginning and ending quotes sourceURL = strings.Trim(sourceURL, "\"") sourceURL = strings.Trim(sourceURL, "'") @@ -60,10 +71,10 @@ func parseWaybackSource(config *sources.Configuration, URL string) (sourceURLs c 
} if parsedSourceURL.IsAbs() { - matches := config.URLsRegex.FindAllString(sourceURL, -1) + URLs := mdExtractor.FindAllString(sourceURL, -1) - for _, match := range matches { - sourceURLs <- match + for _, URL := range URLs { + sourceURLs <- URL } } else { _, _, err := mime.ParseMediaType(sourceURL) @@ -71,13 +82,13 @@ func parseWaybackSource(config *sources.Configuration, URL string) (sourceURLs c continue } - matches := config.URLsRegex.FindAllString(sourceURL, -1) + URLs := mdExtractor.FindAllString(sourceURL, -1) - for _, match := range matches { - sourceURLs <- match + for _, URL := range URLs { + sourceURLs <- URL } - if len(matches) > 0 { + if len(URLs) > 0 { continue } diff --git a/pkg/xurlfind3r/xurlfind3r.go b/pkg/xurlfind3r/xurlfind3r.go index 3010a42..ffae124 100644 --- a/pkg/xurlfind3r/xurlfind3r.go +++ b/pkg/xurlfind3r/xurlfind3r.go @@ -1,7 +1,6 @@ package xurlfind3r import ( - "fmt" "regexp" "sync" @@ -16,7 +15,6 @@ import ( ) type Options struct { - Domain string IncludeSubdomains bool SourcesToUSe []string SourcesToExclude []string @@ -38,15 +36,10 @@ func New(options *Options) (finder *Finder, err error) { finder = &Finder{ Sources: map[string]sources.Source{}, SourcesConfiguration: &sources.Configuration{ - Domain: options.Domain, IncludeSubdomains: options.IncludeSubdomains, Keys: options.Keys, ParseWaybackRobots: options.ParseWaybackRobots, ParseWaybackSource: options.ParseWaybackSource, - LinkFinderRegex: regexp.MustCompile(`(?:"|')(((?:[a-zA-Z]{1,10}://|//)[^"'/]{1,}\.[a-zA-Z]{2,}[^"']{0,})|((?:/|\.\./|\./)[^"'><,;| *()(%%$^/\\\[\]][^"'><,;|()]{1,})|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{1,}\.(?:[a-zA-Z]{1,4}|action)(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{3,}(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-]{1,}\.(?:php|asp|aspx|jsp|json|action|html|js|txt|xml)(?:[\?|#][^"|']{0,}|)))(?:"|')`), //nolint:gocritic // Works so far - URLsRegex: regexp.MustCompile(fmt.Sprintf(`https?://(?:[\w.-]+\.)?%s(?:/[\w.-]*)*(?:\?[^\s#]*)?(?:#[^\s]*)?`, regexp.QuoteMeta(options.Domain))), - MediaURLsRegex: regexp.MustCompile(`(?i)\.(apng|bpm|png|bmp|gif|heif|ico|cur|jpg|jpeg|jfif|pjp|pjpeg|psd|raw|svg|tif|tiff|webp|xbm|3gp|aac|flac|mpg|mpeg|mp3|mp4|m4a|m4v|m4p|oga|ogg|ogv|mov|wav|webm|eot|woff|woff2|ttf|otf)(?:\?|#|$)`), - RobotsURLsRegex: regexp.MustCompile(`^(https?)://[^ "]+/robots.txt$`), }, } @@ -100,7 +93,7 @@ func New(options *Options) (finder *Finder, err error) { return } -func (finder *Finder) Find() (URLs chan sources.URL) { +func (finder *Finder) Find(domain string) (URLs chan sources.URL) { URLs = make(chan sources.URL) go func() { @@ -115,7 +108,9 @@ func (finder *Finder) Find() (URLs chan sources.URL) { go func(source sources.Source) { defer wg.Done() - for URL := range source.Run(finder.SourcesConfiguration) { + results := source.Run(finder.SourcesConfiguration, domain) + + for URL := range results { value := URL.Value _, loaded := seen.LoadOrStore(value, struct{}{}) From a75ea1dcca93ed9f0fb0b140f7d66467a5477190 Mon Sep 17 00:00:00 2001 From: "Alex Munene (@enenumxela)" <62714471+enenumxela@users.noreply.github.com> Date: Wed, 26 Jul 2023 14:21:54 +0300 Subject: [PATCH 09/24] chore: - --- cmd/xurlfind3r/main.go | 106 +++++++++++++++++++++++++++-------------- 1 file changed, 69 insertions(+), 37 deletions(-) diff --git a/cmd/xurlfind3r/main.go b/cmd/xurlfind3r/main.go index 013a1c0..9e2a6a1 100644 --- a/cmd/xurlfind3r/main.go +++ b/cmd/xurlfind3r/main.go @@ -130,21 +130,21 @@ func init() { } func main() { - // Print Banner + // Print Banner. 
 	if verbosity != string(levels.LevelSilent) {
 		fmt.Fprintln(os.Stderr, configuration.BANNER)
 	}
 
-	// Read in configuration
+	// Read in configuration.
 	config, err := configuration.Read(YAMLConfigFile)
 	if err != nil {
 		hqgolog.Fatal().Msg(err.Error())
 	}
 
-	// List suported sources
+	// If listSources: List supported sources & exit.
 	if listSources {
 		hqgolog.Info().Msgf("listing, %v, current supported sources.", au.Underline(strconv.Itoa(len(config.Sources))).Bold())
-		hqgolog.Info().Msgf("sources marked with %v need key(s) or token(s) to work.", au.Underline("*").Bold())
+		hqgolog.Info().Msgf("sources marked with %v accept key(s) or token(s).", au.Underline("*").Bold())
 
 		hqgolog.Print().Msg("")
 
 		needsKey := make(map[string]interface{})
@@ -154,11 +154,11 @@ func main() {
 			needsKey[strings.ToLower(keysElem.Type().Field(i).Name)] = keysElem.Field(i).Interface()
 		}
 
-		for _, source := range config.Sources {
+		for i, source := range config.Sources {
 			if _, ok := needsKey[source]; ok {
-				hqgolog.Print().Msgf("> %s *", source)
+				hqgolog.Print().Msgf("%d. %s *", i+1, source)
 			} else {
-				hqgolog.Print().Msgf("> %s", source)
+				hqgolog.Print().Msgf("%d. %s", i+1, source)
 			}
 		}
 
@@ -167,55 +167,85 @@ func main() {
 		os.Exit(0)
 	}
 
+	// Load input domains.
 	domains := make(chan string, threads)
 
-	// Load input domains
 	go func() {
 		defer close(domains)
 
+		wg := &sync.WaitGroup{}
+
 		// input domains: slice
-		for _, domain := range domainsSlice {
-			domains <- domain
+		if len(domainsSlice) > 0 {
+			wg.Add(1)
+
+			go func() {
+				defer wg.Done()
+
+				for _, domain := range domainsSlice {
+					domains <- domain
+				}
+			}()
 		}
 
 		// input domains: file
 		if domainsListFilePath != "" {
-			file, err := os.Open(domainsListFilePath)
-			if err != nil {
-				hqgolog.Error().Msg(err.Error())
-			}
+			wg.Add(1)
 
-			scanner := bufio.NewScanner(file)
+			go func() {
+				defer wg.Done()
 
-			for scanner.Scan() {
-				domain := scanner.Text()
+				file, err := os.Open(domainsListFilePath)
+				if err != nil {
+					hqgolog.Error().Msg(err.Error())
 
-				if domain != "" {
-					domains <- domain
+					return
 				}
-			}
 
-			if err := scanner.Err(); err != nil {
-				hqgolog.Error().Msg(err.Error())
-			}
+				scanner := bufio.NewScanner(file)
+
+				for scanner.Scan() {
+					domain := scanner.Text()
+
+					if domain != "" {
+						domains <- domain
+					}
+				}
+
+				if err := scanner.Err(); err != nil {
+					hqgolog.Error().Msg(err.Error())
+
+					return
+				}
+			}()
 		}
 
 		// input domains: stdin
 		if hasStdin() {
-			scanner := bufio.NewScanner(os.Stdin)
+			wg.Add(1)
 
-			for scanner.Scan() {
-				domain := scanner.Text()
+			go func() {
+				defer wg.Done()
 
-				if domain != "" {
-					domains <- domain
+				scanner := bufio.NewScanner(os.Stdin)
+
+				for scanner.Scan() {
+					domain := scanner.Text()
+
+					if domain != "" {
+						domains <- domain
+					}
 				}
-			}
 
-			if err := scanner.Err(); err != nil {
-				hqgolog.Error().Msg(err.Error())
-			}
+				if err := scanner.Err(); err != nil {
+					hqgolog.Error().Msg(err.Error())
+
+					return
+				}
+			}()
 		}
+
+		wg.Wait()
 	}()
 
 	// Find and output URLs.
@@ -271,18 +301,20 @@ func main() {
 
 			switch {
 			case output != "":
-				processURLs(consolidatedWriter, URLs, verbosity)
+				outputURLs(consolidatedWriter, URLs, verbosity)
 			case outputDirectory != "":
 				domainFile, err := os.OpenFile(filepath.Join(outputDirectory, domain+".txt"), os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
 				if err != nil {
-					hqgolog.Fatal().Msg(err.Error())
+					hqgolog.Error().Msg(err.Error())
+
+					return
 				}
 
 				domainWriter := bufio.NewWriter(domainFile)
 
-				processURLs(domainWriter, URLs, verbosity)
+				outputURLs(domainWriter, URLs, verbosity)
 			default:
-				processURLs(nil, URLs, verbosity)
+				outputURLs(nil, URLs, verbosity)
 			}
 		}
 	}()
@@ -311,7 +343,7 @@ func mkdir(path string) {
 	}
 }
 
-func processURLs(writer *bufio.Writer, URLs chan sources.URL, verbosity string) {
+func outputURLs(writer *bufio.Writer, URLs chan sources.URL, verbosity string) {
 	for URL := range URLs {
 		if verbosity == string(levels.LevelSilent) {
 			hqgolog.Print().Msg(URL.Value)

From c006d8efad984b9d7f0a781c3ee82442f01f92a2 Mon Sep 17 00:00:00 2001
From: "Alex Munene (@enenumxela)" <62714471+enenumxela@users.noreply.github.com>
Date: Wed, 26 Jul 2023 14:31:51 +0300
Subject: [PATCH 10/24] docs: Update README.md

---
 README.md | 59 +++++++++++++++++++++++++++----------------------------
 1 file changed, 29 insertions(+), 30 deletions(-)

diff --git a/README.md b/README.md
index 588be10..466da65 100644
--- a/README.md
+++ b/README.md
@@ -22,18 +22,12 @@
 
 ## Features
 
-* Fetches URLs from curated passive sources to maximize results:
-  * **[AlienVault's OTX](https://otx.alienvault.com/)**
-  * **[BeVigil](https://bevigil.com)**
-  * **[Common Crawl](https://commoncrawl.org/)**
-  * **[Github](https://github.com)**
-  * **[Intelligence X](https://intelx.io)**
-  * **[URLScan](https://urlscan.io/)**
-  * **[Wayback Machine](https://archive.org/web/)**
-* With Wayback Machine, Parses URLs from `robots.txt` snapshots.
-* With Wayback Machine, Parses URLs from webpages' snapshots.
-* Cross-Platform (Windows, Linux & macOS)
-* Supports URLs match and filter
+* Fetches URLs from curated passive sources to maximize results.
+  * **[AlienVault's OTX](https://otx.alienvault.com/)** ◇ **[BeVigil](https://bevigil.com)** ◇ **[Common Crawl](https://commoncrawl.org/)** ◇ **[Github](https://github.com)** ◇ **[Intelligence X](https://intelx.io)** ◇ **[URLScan](https://urlscan.io/)** ◇ **[Wayback Machine](https://archive.org/web/)**
+* Parses URLs from wayback webpages and `robots.txt` snapshots.
+* Supports URLs matching and filtering.
+* Supports `stdin` and `stdout` for easy integration into workflows.
+* Cross-Platform (Windows, Linux & macOS).
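For example, the `stdin` support lets the tool slot straight into a shell pipeline (assuming a `domains.txt` file with one target domain per line):

	cat domains.txt | xurlfind3r --include-subdomains -v silent > urls.txt
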
## Installation @@ -159,30 +153,35 @@ __ ___ _ _ __| |/ _(_)_ __ __| |___ / _ __ USAGE: xurlfind3r [OPTIONS] -TARGET: - -d, --domain string domain to match URLs - --include-subdomains bool match subdomain's URLs +INPUT: + -d, --domain string[] target domains + -l, --list string target domains' list file path + +SCOPE: + --include-subdomains bool match subdomain's URLs SOURCES: - -s, --sources bool list supported sources - -u, --use-sources strings comma(,) separated sources to use - -e, --exclude-sources strings comma(,) separated sources to exclude - --parse-wayback-robots bool with wayback, parse robots.txt snapshots - --parse-wayback-source bool with wayback, parse source code snapshots + --sources bool list supported sources + -u, --use-sources string[] comma(,) separated sources to use + -e, --exclude-sources string[] comma(,) separated sources to exclude + --parse-wayback-robots bool with wayback, parse robots.txt snapshots + --parse-wayback-source bool with wayback, parse source code snapshots + +OPTIMIZATION: + -t, --threads int number of threads (default: 50) FILTER & MATCH: - -f, --filter string regex to filter URLs - -m, --match string regex to match URLs + -f, --filter string regex to filter URLs + -m, --match string regex to match URLs OUTPUT: - --no-color bool no color mode - -o, --output string output URLs file path - -v, --verbosity string debug, info, warning, error, fatal or silent (default: info) + --no-color bool disable colored output + -o, --output string output URLs file path + -O, --output-directory string output URLs directory path + -v, --verbosity string debug, info, warning, error, fatal or silent (default: info) CONFIGURATION: - -c, --configuration string configuration file path (default: ~/.hueristiq/xurlfind3r/config.yaml) - -pflag: help requested + -c, --configuration string configuration file path (default: ~/.hueristiq/xurlfind3r/config.yaml) ``` ### Examples @@ -209,8 +208,8 @@ xurlfind3r -d hackerone.com --include-subdomains -m '^https?://[^/]*?/.*\.js(\?[ ## Contributing -[Issues](https://github.com/hueristiq/xurlfind3r/issues) and [Pull Requests](https://github.com/hueristiq/xurlfind3r/pulls) are welcome! **Check out the [contribution guidelines](./CONTRIBUTING.md).** +[Issues](https://github.com/hueristiq/xurlfind3r/issues) and [Pull Requests](https://github.com/hueristiq/xurlfind3r/pulls) are welcome! **Check out the [contribution guidelines](https://github.com/hueristiq/xurlfind3r/blob/master/CONTRIBUTING.md).** ## Licensing -This utility is distributed under the [MIT license](./LICENSE). \ No newline at end of file +This utility is distributed under the [MIT license](https://github.com/hueristiq/xurlfind3r/blob/master/LICENSE). 
\ No newline at end of file From d0bcea6107c5a134aa1606d1bb8111a78d9320ef Mon Sep 17 00:00:00 2001 From: "Alex Munene (@enenumxela)" <62714471+enenumxela@users.noreply.github.com> Date: Wed, 26 Jul 2023 14:38:40 +0300 Subject: [PATCH 11/24] chore: - --- pkg/xurlfind3r/httpclient/client.go | 4 ---- pkg/xurlfind3r/sources/bevigil/bevigil.go | 3 ++- pkg/xurlfind3r/sources/commoncrawl/commoncrawl.go | 6 ++++-- pkg/xurlfind3r/sources/github/github.go | 5 ++++- pkg/xurlfind3r/sources/intelx/intelx.go | 3 ++- pkg/xurlfind3r/sources/otx/otx.go | 3 ++- pkg/xurlfind3r/sources/urlscan/urlscan.go | 3 ++- pkg/xurlfind3r/sources/wayback/waybackrobots.go | 2 +- pkg/xurlfind3r/sources/wayback/waybacksource.go | 1 + 9 files changed, 18 insertions(+), 12 deletions(-) diff --git a/pkg/xurlfind3r/httpclient/client.go b/pkg/xurlfind3r/httpclient/client.go index fc14393..01cdc15 100644 --- a/pkg/xurlfind3r/httpclient/client.go +++ b/pkg/xurlfind3r/httpclient/client.go @@ -15,10 +15,6 @@ func httpRequestWrapper(req *fasthttp.Request) (res *fasthttp.Response, err erro return } - // if res.StatusCode() != fasthttp.StatusOK { - // err = fmt.Errorf("unexpected status code") - // } - return } diff --git a/pkg/xurlfind3r/sources/bevigil/bevigil.go b/pkg/xurlfind3r/sources/bevigil/bevigil.go index 13970ad..4433739 100644 --- a/pkg/xurlfind3r/sources/bevigil/bevigil.go +++ b/pkg/xurlfind3r/sources/bevigil/bevigil.go @@ -22,9 +22,10 @@ func (source *Source) Run(config *sources.Configuration, domain string) (URLsCha go func() { defer close(URLsChannel) - var key string var err error + var key string + key, err = sources.PickRandom(config.Keys.Bevigil) if key == "" || err != nil { return diff --git a/pkg/xurlfind3r/sources/commoncrawl/commoncrawl.go b/pkg/xurlfind3r/sources/commoncrawl/commoncrawl.go index 0ea6158..8fc9c7b 100644 --- a/pkg/xurlfind3r/sources/commoncrawl/commoncrawl.go +++ b/pkg/xurlfind3r/sources/commoncrawl/commoncrawl.go @@ -31,9 +31,10 @@ func (source *Source) Run(config *sources.Configuration, domain string) (URLsCha go func() { defer close(URLsChannel) - var indexesRes *fasthttp.Response var err error + var indexesRes *fasthttp.Response + indexesRes, err = httpclient.SimpleGet("https://index.commoncrawl.org/collinfo.json") if err != nil { return @@ -57,9 +58,10 @@ func (source *Source) Run(config *sources.Configuration, domain string) (URLsCha "Host": "index.commoncrawl.org", } - var contentRes *fasthttp.Response var err error + var contentRes *fasthttp.Response + contentRes, err = httpclient.Get(fmt.Sprintf("%s?url=*.%s/*&output=json&fl=url", API, domain), "", contentReqHeaders) if err != nil { return diff --git a/pkg/xurlfind3r/sources/github/github.go b/pkg/xurlfind3r/sources/github/github.go index 6d34504..4a81277 100644 --- a/pkg/xurlfind3r/sources/github/github.go +++ b/pkg/xurlfind3r/sources/github/github.go @@ -51,7 +51,7 @@ func (source *Source) Run(config *sources.Configuration, domain string) (URLsCha return URLsChannel } -func (source *Source) Enumerate(searchURL string, domain string, tokens *Tokens, URLsChannel chan sources.URL, config *sources.Configuration) { +func (source *Source) Enumerate(searchURL, domain string, tokens *Tokens, URLsChannel chan sources.URL, config *sources.Configuration) { token := tokens.Get() if token.RetryAfter > 0 { @@ -68,10 +68,13 @@ func (source *Source) Enumerate(searchURL string, domain string, tokens *Tokens, } var err error + var searchRes *fasthttp.Response searchRes, err = httpclient.Request(fasthttp.MethodGet, searchURL, "", reqHeaders, nil) + 
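	// GitHub's search API answers with 403 Forbidden once a token's rate
	// limit is exhausted, so a Forbidden response is treated as retryable
	// below (via the token manager's back-off) rather than as a hard error.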
isForbidden := searchRes != nil && searchRes.StatusCode() == fasthttp.StatusForbidden + if err != nil && !isForbidden { return } diff --git a/pkg/xurlfind3r/sources/intelx/intelx.go b/pkg/xurlfind3r/sources/intelx/intelx.go index 29812ce..dbb32dd 100644 --- a/pkg/xurlfind3r/sources/intelx/intelx.go +++ b/pkg/xurlfind3r/sources/intelx/intelx.go @@ -39,9 +39,10 @@ func (source *Source) Run(config *sources.Configuration, domain string) (URLsCha go func() { defer close(URLsChannel) - var key string var err error + var key string + key, err = sources.PickRandom(config.Keys.Intelx) if key == "" || err != nil { return diff --git a/pkg/xurlfind3r/sources/otx/otx.go b/pkg/xurlfind3r/sources/otx/otx.go index bde9baf..5142fec 100644 --- a/pkg/xurlfind3r/sources/otx/otx.go +++ b/pkg/xurlfind3r/sources/otx/otx.go @@ -38,9 +38,10 @@ func (source *Source) Run(config *sources.Configuration, domain string) (URLsCha for page := 1; ; page++ { reqURL := fmt.Sprintf("https://otx.alienvault.com/api/v1/indicators/domain/%s/url_list?limit=%d&page=%d", domain, 200, page) - var res *fasthttp.Response var err error + var res *fasthttp.Response + res, err = httpclient.SimpleGet(reqURL) if err != nil { return diff --git a/pkg/xurlfind3r/sources/urlscan/urlscan.go b/pkg/xurlfind3r/sources/urlscan/urlscan.go index 3e352c0..d66ca22 100644 --- a/pkg/xurlfind3r/sources/urlscan/urlscan.go +++ b/pkg/xurlfind3r/sources/urlscan/urlscan.go @@ -35,9 +35,10 @@ func (source *Source) Run(config *sources.Configuration, domain string) (URLsCha go func() { defer close(URLsChannel) - var key string var err error + var key string + key, err = sources.PickRandom(config.Keys.URLScan) if err != nil { return diff --git a/pkg/xurlfind3r/sources/wayback/waybackrobots.go b/pkg/xurlfind3r/sources/wayback/waybackrobots.go index 5fafa87..4ed89d4 100644 --- a/pkg/xurlfind3r/sources/wayback/waybackrobots.go +++ b/pkg/xurlfind3r/sources/wayback/waybackrobots.go @@ -10,7 +10,7 @@ import ( "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/sources" ) -func parseWaybackRobots(config *sources.Configuration, URL string) (robotsURLs chan string) { +func parseWaybackRobots(_ *sources.Configuration, URL string) (robotsURLs chan string) { robotsURLs = make(chan string) robotsEntryRegex := regexp.MustCompile(`Disallow:\s?.+`) diff --git a/pkg/xurlfind3r/sources/wayback/waybacksource.go b/pkg/xurlfind3r/sources/wayback/waybacksource.go index e288574..329c097 100644 --- a/pkg/xurlfind3r/sources/wayback/waybacksource.go +++ b/pkg/xurlfind3r/sources/wayback/waybacksource.go @@ -17,6 +17,7 @@ func parseWaybackSource(domain, URL string) (sourceURLs chan string) { defer close(sourceURLs) var err error + var snapshots [][2]string snapshots, err = getWaybackSnapshots(URL) From c7c861572b18acee67665004762780ad30589145 Mon Sep 17 00:00:00 2001 From: "Alex Munene (@enenumxela)" <62714471+enenumxela@users.noreply.github.com> Date: Wed, 26 Jul 2023 16:05:53 +0300 Subject: [PATCH 12/24] refactor: Revise bevigil workings --- pkg/xurlfind3r/sources/bevigil/bevigil.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/xurlfind3r/sources/bevigil/bevigil.go b/pkg/xurlfind3r/sources/bevigil/bevigil.go index 4433739..1a4e110 100644 --- a/pkg/xurlfind3r/sources/bevigil/bevigil.go +++ b/pkg/xurlfind3r/sources/bevigil/bevigil.go @@ -9,7 +9,7 @@ import ( "github.com/valyala/fasthttp" ) -type response struct { +type getURLsResponse struct { Domain string `json:"domain"` URLs []string `json:"urls"` } @@ -46,13 +46,13 @@ func (source *Source) Run(config 
*sources.Configuration, domain string) (URLsCha return } - var responseData response + var getURLsResponseData getURLsResponse - if err = json.Unmarshal(res.Body(), &responseData); err != nil { + if err = json.Unmarshal(res.Body(), &getURLsResponseData); err != nil { return } - for _, URL := range responseData.URLs { + for _, URL := range getURLsResponseData.URLs { if !sources.IsInScope(URL, domain, config.IncludeSubdomains) { return } From 38cdb8a86c2439d9f5bd75be53fc30102b304a84 Mon Sep 17 00:00:00 2001 From: "Alex Munene (@enenumxela)" <62714471+enenumxela@users.noreply.github.com> Date: Wed, 26 Jul 2023 16:20:04 +0300 Subject: [PATCH 13/24] refactor: Revise commoncrawl workings --- .../sources/commoncrawl/commoncrawl.go | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/pkg/xurlfind3r/sources/commoncrawl/commoncrawl.go b/pkg/xurlfind3r/sources/commoncrawl/commoncrawl.go index 8fc9c7b..6752eb6 100644 --- a/pkg/xurlfind3r/sources/commoncrawl/commoncrawl.go +++ b/pkg/xurlfind3r/sources/commoncrawl/commoncrawl.go @@ -1,4 +1,3 @@ -// Package commoncrawl implements functions to search URLs from commoncrawl. package commoncrawl import ( @@ -13,12 +12,12 @@ import ( "github.com/valyala/fasthttp" ) -type indexesResponse []struct { +type getIndexesResponse []struct { ID string `json:"id"` API string `json:"cdx-API"` } -type response struct { +type getURLsResponse struct { URL string `json:"url"` Error string `json:"error"` } @@ -28,59 +27,63 @@ type Source struct{} func (source *Source) Run(config *sources.Configuration, domain string) (URLsChannel chan sources.URL) { URLsChannel = make(chan sources.URL) + if config.IncludeSubdomains { + domain = "*." + domain + } + go func() { defer close(URLsChannel) var err error - var indexesRes *fasthttp.Response + var getIndexesRes *fasthttp.Response - indexesRes, err = httpclient.SimpleGet("https://index.commoncrawl.org/collinfo.json") + getIndexesRes, err = httpclient.SimpleGet("https://index.commoncrawl.org/collinfo.json") if err != nil { return } - var indexesResponseData indexesResponse + var getIndexesResData getIndexesResponse - if err = json.Unmarshal(indexesRes.Body(), &indexesResponseData); err != nil { + if err = json.Unmarshal(getIndexesRes.Body(), &getIndexesResData); err != nil { return } wg := new(sync.WaitGroup) - for _, indexData := range indexesResponseData { + for _, indexData := range getIndexesResData { wg.Add(1) go func(API string) { defer wg.Done() - contentReqHeaders := map[string]string{ + getURLsReqHeaders := map[string]string{ "Host": "index.commoncrawl.org", } var err error - var contentRes *fasthttp.Response + var getURLsRes *fasthttp.Response - contentRes, err = httpclient.Get(fmt.Sprintf("%s?url=*.%s/*&output=json&fl=url", API, domain), "", contentReqHeaders) + getURLsRes, err = httpclient.Get(fmt.Sprintf("%s?url=%s/*&output=json&fl=url", API, domain), "", getURLsReqHeaders) if err != nil { return } - scanner := bufio.NewScanner(bytes.NewReader(contentRes.Body())) + scanner := bufio.NewScanner(bytes.NewReader(getURLsRes.Body())) for scanner.Scan() { - var data response + var getURLsResData getURLsResponse - if err = json.Unmarshal(scanner.Bytes(), &data); err != nil { + if err = json.Unmarshal(scanner.Bytes(), &getURLsResData); err != nil { return } - if data.Error != "" { + if getURLsResData.Error != "" { return } - URL := data.URL + URL := getURLsResData.URL if !sources.IsInScope(URL, domain, config.IncludeSubdomains) { return From bcf3203c985dbe9a0978960881ea75fb7a05fd75 Mon Sep 17 
00:00:00 2001 From: "Alex Munene (@enenumxela)" <62714471+enenumxela@users.noreply.github.com> Date: Wed, 26 Jul 2023 16:24:28 +0300 Subject: [PATCH 14/24] chore: - --- pkg/xurlfind3r/sources/bevigil/bevigil.go | 16 ++++++++-------- .../sources/commoncrawl/commoncrawl.go | 8 ++++++-- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/pkg/xurlfind3r/sources/bevigil/bevigil.go b/pkg/xurlfind3r/sources/bevigil/bevigil.go index 1a4e110..e08a70f 100644 --- a/pkg/xurlfind3r/sources/bevigil/bevigil.go +++ b/pkg/xurlfind3r/sources/bevigil/bevigil.go @@ -31,28 +31,28 @@ func (source *Source) Run(config *sources.Configuration, domain string) (URLsCha return } - reqHeaders := map[string]string{} + getURLsReqHeaders := map[string]string{} if len(config.Keys.Bevigil) > 0 { - reqHeaders["X-Access-Token"] = key + getURLsReqHeaders["X-Access-Token"] = key } - reqURL := fmt.Sprintf("https://osint.bevigil.com/api/%s/urls/", domain) + getURLsReqURL := fmt.Sprintf("https://osint.bevigil.com/api/%s/urls/", domain) - var res *fasthttp.Response + var getURLsRes *fasthttp.Response - res, err = httpclient.Request(fasthttp.MethodGet, reqURL, "", reqHeaders, nil) + getURLsRes, err = httpclient.Request(fasthttp.MethodGet, getURLsReqURL, "", getURLsReqHeaders, nil) if err != nil { return } - var getURLsResponseData getURLsResponse + var getURLsResData getURLsResponse - if err = json.Unmarshal(res.Body(), &getURLsResponseData); err != nil { + if err = json.Unmarshal(getURLsRes.Body(), &getURLsResData); err != nil { return } - for _, URL := range getURLsResponseData.URLs { + for _, URL := range getURLsResData.URLs { if !sources.IsInScope(URL, domain, config.IncludeSubdomains) { return } diff --git a/pkg/xurlfind3r/sources/commoncrawl/commoncrawl.go b/pkg/xurlfind3r/sources/commoncrawl/commoncrawl.go index 6752eb6..14ef985 100644 --- a/pkg/xurlfind3r/sources/commoncrawl/commoncrawl.go +++ b/pkg/xurlfind3r/sources/commoncrawl/commoncrawl.go @@ -34,11 +34,13 @@ func (source *Source) Run(config *sources.Configuration, domain string) (URLsCha go func() { defer close(URLsChannel) + getIndexesReqURL := "https://index.commoncrawl.org/collinfo.json" + var err error var getIndexesRes *fasthttp.Response - getIndexesRes, err = httpclient.SimpleGet("https://index.commoncrawl.org/collinfo.json") + getIndexesRes, err = httpclient.SimpleGet(getIndexesReqURL) if err != nil { return } @@ -61,11 +63,13 @@ func (source *Source) Run(config *sources.Configuration, domain string) (URLsCha "Host": "index.commoncrawl.org", } + getURLsReqURL := fmt.Sprintf("%s?url=%s/*&output=json&fl=url", API, domain) + var err error var getURLsRes *fasthttp.Response - getURLsRes, err = httpclient.Get(fmt.Sprintf("%s?url=%s/*&output=json&fl=url", API, domain), "", getURLsReqHeaders) + getURLsRes, err = httpclient.Get(getURLsReqURL, "", getURLsReqHeaders) if err != nil { return } From 0f59c084903aa9e73b33d6e4a014b3511249fdfa Mon Sep 17 00:00:00 2001 From: "Alex Munene (@enenumxela)" <62714471+enenumxela@users.noreply.github.com> Date: Wed, 26 Jul 2023 16:58:28 +0300 Subject: [PATCH 15/24] refactor: Revise urlscan workings --- pkg/xurlfind3r/sources/urlscan/urlscan.go | 45 +++++++++-------------- 1 file changed, 18 insertions(+), 27 deletions(-) diff --git a/pkg/xurlfind3r/sources/urlscan/urlscan.go b/pkg/xurlfind3r/sources/urlscan/urlscan.go index d66ca22..cc55f92 100644 --- a/pkg/xurlfind3r/sources/urlscan/urlscan.go +++ b/pkg/xurlfind3r/sources/urlscan/urlscan.go @@ -1,9 +1,8 @@ -// Package urlscan implements functions to search URLs from urlscan. 
package urlscan import ( "encoding/json" - "net/url" + "fmt" "strings" "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/httpclient" @@ -11,7 +10,7 @@ import ( "github.com/valyala/fasthttp" ) -type response struct { +type searchResponse struct { Results []struct { Page struct { Domain string `json:"domain"` @@ -44,65 +43,57 @@ func (source *Source) Run(config *sources.Configuration, domain string) (URLsCha return } - reqHeaders := map[string]string{ + searchReqHeaders := map[string]string{ "Content-Type": "application/json", } if key != "" { - reqHeaders["API-Key"] = key + searchReqHeaders["API-Key"] = key } var searchAfter []interface{} for { - baseURL := "https://urlscan.io/api/v1/search/" - params := url.Values{} - params.Set("q", domain) + after := "" if searchAfter != nil { searchAfterJSON, _ := json.Marshal(searchAfter) - params.Set("search_after", string(searchAfterJSON)) + after = "&search_after=" + string(searchAfterJSON) } - reqURL := baseURL + "?" + params.Encode() + searchReqURL := fmt.Sprintf("https://urlscan.io/api/v1/search/?q=domain:%s&size=100", domain) + after - var res *fasthttp.Response + var searchRes *fasthttp.Response - res, err = httpclient.Request(fasthttp.MethodGet, reqURL, "", reqHeaders, nil) + searchRes, err = httpclient.Get(searchReqURL, "", searchReqHeaders) if err != nil { return } - var responseData response + var searchResData searchResponse - if err = json.Unmarshal(res.Body(), &responseData); err != nil { + if err = json.Unmarshal(searchRes.Body(), &searchResData); err != nil { return } - if responseData.Status == 429 { + if searchResData.Status == 429 { break } - for _, result := range responseData.Results { - URL := result.Page.URL - - if result.Page.Domain != domain || - !strings.HasSuffix(result.Page.Domain, domain) { - continue - } - - if !sources.IsInScope(URL, domain, config.IncludeSubdomains) { + for _, result := range searchResData.Results { + if (result.Page.Domain != domain && result.Page.Domain != "www."+domain) && + (config.IncludeSubdomains && !strings.HasSuffix(result.Page.Domain, domain)) { return } - URLsChannel <- sources.URL{Source: source.Name(), Value: URL} + URLsChannel <- sources.URL{Source: source.Name(), Value: result.Page.URL} } - if !responseData.HasMore { + if !searchResData.HasMore { break } - lastResult := responseData.Results[len(responseData.Results)-1] + lastResult := searchResData.Results[len(searchResData.Results)-1] searchAfter = lastResult.Sort } }() From e06f168e512d6044d2fb0f30f7b59d4895d7abfb Mon Sep 17 00:00:00 2001 From: "Alex Munene (@enenumxela)" <62714471+enenumxela@users.noreply.github.com> Date: Wed, 26 Jul 2023 16:59:46 +0300 Subject: [PATCH 16/24] chore: - --- pkg/xurlfind3r/sources/bevigil/bevigil.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/xurlfind3r/sources/bevigil/bevigil.go b/pkg/xurlfind3r/sources/bevigil/bevigil.go index e08a70f..0304a10 100644 --- a/pkg/xurlfind3r/sources/bevigil/bevigil.go +++ b/pkg/xurlfind3r/sources/bevigil/bevigil.go @@ -41,7 +41,7 @@ func (source *Source) Run(config *sources.Configuration, domain string) (URLsCha var getURLsRes *fasthttp.Response - getURLsRes, err = httpclient.Request(fasthttp.MethodGet, getURLsReqURL, "", getURLsReqHeaders, nil) + getURLsRes, err = httpclient.Get(getURLsReqURL, "", getURLsReqHeaders) if err != nil { return } From f758d27f193fc84c42c6c3ab7b4410f194d92aae Mon Sep 17 00:00:00 2001 From: "Alex Munene (@enenumxela)" <62714471+enenumxela@users.noreply.github.com> Date: Wed, 26 Jul 2023 17:54:36 +0300 Subject: [PATCH 
17/24] refactor: Revise otx workings --- pkg/xurlfind3r/sources/otx/otx.go | 38 +++++++++++++++++++------------ 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/pkg/xurlfind3r/sources/otx/otx.go b/pkg/xurlfind3r/sources/otx/otx.go index 5142fec..eea1f0a 100644 --- a/pkg/xurlfind3r/sources/otx/otx.go +++ b/pkg/xurlfind3r/sources/otx/otx.go @@ -1,26 +1,31 @@ -// Package otx implements functions to search URLs from otx. package otx import ( "encoding/json" "fmt" + "github.com/hueristiq/hqgourl" "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/httpclient" "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/sources" "github.com/valyala/fasthttp" ) -type response struct { +type getURLsResponse struct { URLList []struct { - Domain string `json:"domain"` URL string `json:"url"` + Domain string `json:"domain"` Hostname string `json:"hostname"` + Result struct { + URLWorker struct { + IP string `json:"ip"` + HTTPCode int `json:"http_code"` + } `json:"urlworker"` + } `json:"result"` HTTPCode int `json:"httpcode"` - PageNum int `json:"page_num"` - FullSize int `json:"full_size"` - Paged bool `json:"paged"` + Encoded string `json:"encoded"` } `json:"url_list"` PageNum int `json:"page_num"` + Limit int `json:"limit"` Paged bool `json:"paged"` HasNext bool `json:"has_next"` FullSize int `json:"full_size"` @@ -35,33 +40,38 @@ func (source *Source) Run(config *sources.Configuration, domain string) (URLsCha go func() { defer close(URLsChannel) + parseURL, err := hqgourl.Parse(domain) + if err != nil { + return + } + for page := 1; ; page++ { - reqURL := fmt.Sprintf("https://otx.alienvault.com/api/v1/indicators/domain/%s/url_list?limit=%d&page=%d", domain, 200, page) + getURLsReqURL := fmt.Sprintf("https://otx.alienvault.com/api/v1/indicators/domain/%s/url_list?limit=100&page=%d", parseURL.ETLDPlusOne, page) var err error - var res *fasthttp.Response + var getURLsRes *fasthttp.Response - res, err = httpclient.SimpleGet(reqURL) + getURLsRes, err = httpclient.SimpleGet(getURLsReqURL) if err != nil { return } - var responseData response + var getURLsResData getURLsResponse - if err = json.Unmarshal(res.Body(), &responseData); err != nil { + if err = json.Unmarshal(getURLsRes.Body(), &getURLsResData); err != nil { return } - for _, URL := range responseData.URLList { + for _, URL := range getURLsResData.URLList { if !sources.IsInScope(URL.URL, domain, config.IncludeSubdomains) { - return + continue } URLsChannel <- sources.URL{Source: source.Name(), Value: URL.URL} } - if !responseData.HasNext { + if !getURLsResData.HasNext { break } } From 3125d9a52f69b162ec4ab383772a8cb67bd2126f Mon Sep 17 00:00:00 2001 From: "Alex Munene (@enenumxela)" <62714471+enenumxela@users.noreply.github.com> Date: Wed, 26 Jul 2023 18:09:03 +0300 Subject: [PATCH 18/24] refactor: Scope control --- pkg/xurlfind3r/sources/bevigil/bevigil.go | 2 +- pkg/xurlfind3r/sources/commoncrawl/commoncrawl.go | 2 +- pkg/xurlfind3r/sources/github/github.go | 4 ++-- pkg/xurlfind3r/sources/intelx/intelx.go | 2 +- pkg/xurlfind3r/sources/otx/otx.go | 8 +++++--- pkg/xurlfind3r/sources/urlscan/urlscan.go | 10 +++++----- pkg/xurlfind3r/sources/utils.go | 13 +++++++++++-- pkg/xurlfind3r/sources/wayback/wayback.go | 4 ++-- 8 files changed, 28 insertions(+), 17 deletions(-) diff --git a/pkg/xurlfind3r/sources/bevigil/bevigil.go b/pkg/xurlfind3r/sources/bevigil/bevigil.go index 0304a10..2bec025 100644 --- a/pkg/xurlfind3r/sources/bevigil/bevigil.go +++ b/pkg/xurlfind3r/sources/bevigil/bevigil.go @@ -54,7 +54,7 @@ func (source *Source) Run(config 
*sources.Configuration, domain string) (URLsCha for _, URL := range getURLsResData.URLs { if !sources.IsInScope(URL, domain, config.IncludeSubdomains) { - return + continue } URLsChannel <- sources.URL{Source: source.Name(), Value: URL} diff --git a/pkg/xurlfind3r/sources/commoncrawl/commoncrawl.go b/pkg/xurlfind3r/sources/commoncrawl/commoncrawl.go index 14ef985..dd2f4a5 100644 --- a/pkg/xurlfind3r/sources/commoncrawl/commoncrawl.go +++ b/pkg/xurlfind3r/sources/commoncrawl/commoncrawl.go @@ -90,7 +90,7 @@ func (source *Source) Run(config *sources.Configuration, domain string) (URLsCha URL := getURLsResData.URL if !sources.IsInScope(URL, domain, config.IncludeSubdomains) { - return + continue } URLsChannel <- sources.URL{Source: source.Name(), Value: URL} diff --git a/pkg/xurlfind3r/sources/github/github.go b/pkg/xurlfind3r/sources/github/github.go index 4a81277..c092b41 100644 --- a/pkg/xurlfind3r/sources/github/github.go +++ b/pkg/xurlfind3r/sources/github/github.go @@ -141,7 +141,7 @@ func (source *Source) Enumerate(searchURL, domain string, tokens *Tokens, URLsCh URL = parsedURL.String() if !sources.IsInScope(URL, domain, config.IncludeSubdomains) { - return + continue } URLsChannel <- sources.URL{Source: source.Name(), Value: URL} @@ -166,7 +166,7 @@ func (source *Source) Enumerate(searchURL, domain string, tokens *Tokens, URLsCh URL = parsedURL.String() if !sources.IsInScope(URL, domain, config.IncludeSubdomains) { - return + continue } URLsChannel <- sources.URL{Source: source.Name(), Value: URL} diff --git a/pkg/xurlfind3r/sources/intelx/intelx.go b/pkg/xurlfind3r/sources/intelx/intelx.go index dbb32dd..f90a32a 100644 --- a/pkg/xurlfind3r/sources/intelx/intelx.go +++ b/pkg/xurlfind3r/sources/intelx/intelx.go @@ -113,7 +113,7 @@ func (source *Source) Run(config *sources.Configuration, domain string) (URLsCha } if !sources.IsInScope(URL, domain, config.IncludeSubdomains) { - return + continue } URLsChannel <- sources.URL{Source: source.Name(), Value: URL} diff --git a/pkg/xurlfind3r/sources/otx/otx.go b/pkg/xurlfind3r/sources/otx/otx.go index eea1f0a..de43893 100644 --- a/pkg/xurlfind3r/sources/otx/otx.go +++ b/pkg/xurlfind3r/sources/otx/otx.go @@ -63,12 +63,14 @@ func (source *Source) Run(config *sources.Configuration, domain string) (URLsCha return } - for _, URL := range getURLsResData.URLList { - if !sources.IsInScope(URL.URL, domain, config.IncludeSubdomains) { + for _, item := range getURLsResData.URLList { + URL := item.URL + + if !sources.IsInScope(URL, domain, config.IncludeSubdomains) { continue } - URLsChannel <- sources.URL{Source: source.Name(), Value: URL.URL} + URLsChannel <- sources.URL{Source: source.Name(), Value: URL} } if !getURLsResData.HasNext { diff --git a/pkg/xurlfind3r/sources/urlscan/urlscan.go b/pkg/xurlfind3r/sources/urlscan/urlscan.go index cc55f92..f9840cc 100644 --- a/pkg/xurlfind3r/sources/urlscan/urlscan.go +++ b/pkg/xurlfind3r/sources/urlscan/urlscan.go @@ -3,7 +3,6 @@ package urlscan import ( "encoding/json" "fmt" - "strings" "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/httpclient" "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/sources" @@ -81,12 +80,13 @@ func (source *Source) Run(config *sources.Configuration, domain string) (URLsCha } for _, result := range searchResData.Results { - if (result.Page.Domain != domain && result.Page.Domain != "www."+domain) && - (config.IncludeSubdomains && !strings.HasSuffix(result.Page.Domain, domain)) { - return + URL := result.Page.URL + + if !sources.IsInScope(URL, domain, config.IncludeSubdomains) { + 
continue } - URLsChannel <- sources.URL{Source: source.Name(), Value: result.Page.URL} + URLsChannel <- sources.URL{Source: source.Name(), Value: URL} } if !searchResData.HasMore { diff --git a/pkg/xurlfind3r/sources/utils.go b/pkg/xurlfind3r/sources/utils.go index 1ded7c8..8ebb158 100644 --- a/pkg/xurlfind3r/sources/utils.go +++ b/pkg/xurlfind3r/sources/utils.go @@ -39,9 +39,18 @@ func IsInScope(URL, domain string, includeSubdomains bool) (isInScope bool) { return } + parsedDomain, err := hqgourl.Parse(domain) + if err != nil { + return + } + + if parsedURL.ETLDPlusOne != parsedDomain.ETLDPlusOne { + return + } + if !includeSubdomains && - parsedURL.Domain != domain && - parsedURL.Domain != "www."+domain { + parsedURL.Domain != parsedDomain.Domain && + parsedURL.Domain != "www."+parsedDomain.Domain { return } diff --git a/pkg/xurlfind3r/sources/wayback/wayback.go b/pkg/xurlfind3r/sources/wayback/wayback.go index ccabc65..f98c779 100644 --- a/pkg/xurlfind3r/sources/wayback/wayback.go +++ b/pkg/xurlfind3r/sources/wayback/wayback.go @@ -90,7 +90,7 @@ func (source *Source) Run(config *sources.Configuration, domain string) (URLsCha robotsURLsRegex.MatchString(URL) { for robotsURL := range parseWaybackRobots(config, URL) { if !sources.IsInScope(URL, domain, config.IncludeSubdomains) { - return + continue } URLsChannel <- sources.URL{Source: source.Name() + ":robots", Value: robotsURL} @@ -99,7 +99,7 @@ func (source *Source) Run(config *sources.Configuration, domain string) (URLsCha !robotsURLsRegex.MatchString(URL) { for sourceURL := range parseWaybackSource(domain, URL) { if !sources.IsInScope(URL, domain, config.IncludeSubdomains) { - return + continue } URLsChannel <- sources.URL{Source: source.Name() + ":source", Value: sourceURL} From 11134a89d9dc91844a12e0a59ace18f1b1252f9a Mon Sep 17 00:00:00 2001 From: "Alex Munene (@enenumxela)" <62714471+enenumxela@users.noreply.github.com> Date: Wed, 26 Jul 2023 19:06:52 +0300 Subject: [PATCH 19/24] refactor: Revise github workings --- pkg/xurlfind3r/sources/github/github.go | 71 +++++++++---------------- pkg/xurlfind3r/sources/github/utils.go | 10 ---- 2 files changed, 25 insertions(+), 56 deletions(-) diff --git a/pkg/xurlfind3r/sources/github/github.go b/pkg/xurlfind3r/sources/github/github.go index c092b41..9bad609 100644 --- a/pkg/xurlfind3r/sources/github/github.go +++ b/pkg/xurlfind3r/sources/github/github.go @@ -1,9 +1,6 @@ -// Package github implements functions to search URLsChannel from github. 
package github import ( - "bufio" - "bytes" "encoding/json" "fmt" "net/url" @@ -18,7 +15,7 @@ import ( "github.com/valyala/fasthttp" ) -type response struct { +type searchResponse struct { TotalCount int `json:"total_count"` Items []struct { Name string `json:"name"` @@ -43,15 +40,15 @@ func (source *Source) Run(config *sources.Configuration, domain string) (URLsCha tokens := NewTokenManager(config.Keys.GitHub) - searchURL := fmt.Sprintf("https://api.github.com/search/code?per_page=100&q=%q&sort=created&order=asc", domain) + searchReqURL := fmt.Sprintf("https://api.github.com/search/code?per_page=100&q=%q&sort=created&order=asc", domain) - source.Enumerate(searchURL, domain, tokens, URLsChannel, config) + source.Enumerate(searchReqURL, domain, tokens, URLsChannel, config) }() return URLsChannel } -func (source *Source) Enumerate(searchURL, domain string, tokens *Tokens, URLsChannel chan sources.URL, config *sources.Configuration) { +func (source *Source) Enumerate(searchReqURL, domain string, tokens *Tokens, URLsChannel chan sources.URL, config *sources.Configuration) { token := tokens.Get() if token.RetryAfter > 0 { @@ -62,7 +59,7 @@ func (source *Source) Enumerate(searchURL, domain string, tokens *Tokens, URLsCh } } - reqHeaders := map[string]string{ + searchReqHeaders := map[string]string{ "Accept": "application/vnd.github.v3.text-match+json", "Authorization": "token " + token.Hash, } @@ -71,7 +68,7 @@ func (source *Source) Enumerate(searchURL, domain string, tokens *Tokens, URLsCh var searchRes *fasthttp.Response - searchRes, err = httpclient.Request(fasthttp.MethodGet, searchURL, "", reqHeaders, nil) + searchRes, err = httpclient.Get(searchReqURL, "", searchReqHeaders) isForbidden := searchRes != nil && searchRes.StatusCode() == fasthttp.StatusForbidden @@ -85,10 +82,10 @@ func (source *Source) Enumerate(searchURL, domain string, tokens *Tokens, URLsCh tokens.setCurrentTokenExceeded(retryAfterSeconds) - source.Enumerate(searchURL, domain, tokens, URLsChannel, config) + source.Enumerate(searchReqURL, domain, tokens, URLsChannel, config) } - var searchResData response + var searchResData searchResponse if err = json.Unmarshal(searchRes.Body(), &searchResData); err != nil { return @@ -96,64 +93,46 @@ func (source *Source) Enumerate(searchURL, domain string, tokens *Tokens, URLsCh var mdExtractor *regexp.Regexp - // (\w[a-zA-Z0-9][a-zA-Z0-9-\\.]*\.)? - // (?:.*\.)? 
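	// Build an extractor that only matches URLs whose host is the
	// regex-quoted target domain, optionally preceded by a subdomain label.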
mdExtractor, err = hqgourl.Extractor.ModerateMatchHost(`(\w[a-zA-Z0-9][a-zA-Z0-9-\\.]*\.)?` + regexp.QuoteMeta(domain)) if err != nil { return } - // Process Items - for index := range searchResData.Items { - item := searchResData.Items[index] + for _, item := range searchResData.Items { + getRawContentReqURL := getRawContentURL(item.HTMLURL) - reqURL := getRawContentURL(item.HTMLURL) + var getRawContentRes *fasthttp.Response - var contentRes *fasthttp.Response - - contentRes, err = httpclient.SimpleGet(reqURL) + getRawContentRes, err = httpclient.SimpleGet(getRawContentReqURL) if err != nil { continue } - if contentRes.StatusCode() != fasthttp.StatusOK { + if getRawContentRes.StatusCode() != fasthttp.StatusOK { continue } - scanner := bufio.NewScanner(bytes.NewReader(contentRes.Body())) - - for scanner.Scan() { - line := scanner.Text() - if line == "" { - continue - } - - URLs := mdExtractor.FindAllString(normalizeContent(line), -1) - - for _, URL := range URLs { - URL = fixURL(URL) + URLs := mdExtractor.FindAllString(string(getRawContentRes.Body()), -1) - parsedURL, err := hqgourl.Parse(URL) - if err != nil { - return - } + for _, URL := range URLs { + URL = fixURL(URL) - URL = parsedURL.String() + parsedURL, err := hqgourl.Parse(URL) + if err != nil { + return + } - if !sources.IsInScope(URL, domain, config.IncludeSubdomains) { - continue - } + URL = parsedURL.String() - URLsChannel <- sources.URL{Source: source.Name(), Value: URL} + if !sources.IsInScope(URL, domain, config.IncludeSubdomains) { + continue } - } - if scanner.Err() != nil { - return + URLsChannel <- sources.URL{Source: source.Name(), Value: URL} } for _, textMatch := range item.TextMatches { - URLs := mdExtractor.FindAllString(normalizeContent(textMatch.Fragment), -1) + URLs := mdExtractor.FindAllString(textMatch.Fragment, -1) for _, URL := range URLs { URL = fixURL(URL) diff --git a/pkg/xurlfind3r/sources/github/utils.go b/pkg/xurlfind3r/sources/github/utils.go index 9787b24..04dfccb 100644 --- a/pkg/xurlfind3r/sources/github/utils.go +++ b/pkg/xurlfind3r/sources/github/utils.go @@ -1,7 +1,6 @@ package github import ( - "net/url" "strings" ) @@ -13,15 +12,6 @@ func getRawContentURL(URL string) (rawContentURL string) { return } -func normalizeContent(content string) (normalizedContent string) { - normalizedContent = content - normalizedContent, _ = url.QueryUnescape(normalizedContent) - normalizedContent = strings.ReplaceAll(normalizedContent, "\\t", "") - normalizedContent = strings.ReplaceAll(normalizedContent, "\\n", "") - - return -} - func fixURL(URL string) (fixedURL string) { fixedURL = URL From 8e840b769e14c8d7819e7707803180a5e22314dd Mon Sep 17 00:00:00 2001 From: "Alex Munene (@enenumxela)" <62714471+enenumxela@users.noreply.github.com> Date: Wed, 26 Jul 2023 20:43:03 +0300 Subject: [PATCH 20/24] refactor: Revise intelx workings --- pkg/xurlfind3r/sources/github/github.go | 4 +- pkg/xurlfind3r/sources/github/utils.go | 97 ------------------------ pkg/xurlfind3r/sources/intelx/intelx.go | 54 +++++++------- pkg/xurlfind3r/sources/utils.go | 98 +++++++++++++++++++++++++ 4 files changed, 128 insertions(+), 125 deletions(-) diff --git a/pkg/xurlfind3r/sources/github/github.go b/pkg/xurlfind3r/sources/github/github.go index 9bad609..131752a 100644 --- a/pkg/xurlfind3r/sources/github/github.go +++ b/pkg/xurlfind3r/sources/github/github.go @@ -115,7 +115,7 @@ func (source *Source) Enumerate(searchReqURL, domain string, tokens *Tokens, URL URLs := mdExtractor.FindAllString(string(getRawContentRes.Body()), -1) for _, URL 
:= range URLs { - URL = fixURL(URL) + URL = sources.FixURL(URL) parsedURL, err := hqgourl.Parse(URL) if err != nil { @@ -135,7 +135,7 @@ func (source *Source) Enumerate(searchReqURL, domain string, tokens *Tokens, URL URLs := mdExtractor.FindAllString(textMatch.Fragment, -1) for _, URL := range URLs { - URL = fixURL(URL) + URL = sources.FixURL(URL) parsedURL, err := hqgourl.Parse(URL) if err != nil { diff --git a/pkg/xurlfind3r/sources/github/utils.go b/pkg/xurlfind3r/sources/github/utils.go index 04dfccb..832d40c 100644 --- a/pkg/xurlfind3r/sources/github/utils.go +++ b/pkg/xurlfind3r/sources/github/utils.go @@ -11,100 +11,3 @@ func getRawContentURL(URL string) (rawContentURL string) { return } - -func fixURL(URL string) (fixedURL string) { - fixedURL = URL - - // ',",`, - quotes := []rune{'\'', '"', '`'} - - for i := range quotes { - quote := quotes[i] - - indexOfQuote := findUnbalancedQuote(URL, quote) - if indexOfQuote <= len(fixedURL) && indexOfQuote >= 0 { - fixedURL = fixedURL[:indexOfQuote] - } - } - - // (),[],{} - parentheses := []struct { - Opening, Closing rune - }{ - {'[', ']'}, - {'(', ')'}, - {'{', '}'}, - } - - for i := range parentheses { - parenthesis := parentheses[i] - - indexOfParenthesis := findUnbalancedBracket(URL, parenthesis.Opening, parenthesis.Closing) - if indexOfParenthesis <= len(fixedURL) && indexOfParenthesis >= 0 { - fixedURL = fixedURL[:indexOfParenthesis] - } - } - - // ; - indexOfComma := strings.Index(fixedURL, ";") - if indexOfComma <= len(fixedURL) && indexOfComma >= 0 { - fixedURL = fixedURL[:indexOfComma] - } - - return -} - -func findUnbalancedQuote(s string, quoteChar rune) int { - insideQuotes := false - - for _, ch := range s { - if ch == quoteChar { - if insideQuotes { - insideQuotes = false - } else { - insideQuotes = true - } - } - } - - // If still inside quotes at the end of the string, - // find the index of the opening quote - if insideQuotes { - for i, ch := range s { - if ch == quoteChar { - return i - } - } - } - - return -1 // return -1 if all quotes are balanced -} - -func findUnbalancedBracket(s string, openChar, closeChar rune) int { - openCount := 0 - - var firstOpenIndex int - - for i, ch := range s { - if ch == openChar { - if openCount == 0 { - firstOpenIndex = i - } - - openCount++ - } else if ch == closeChar { - openCount-- - - if openCount < 0 { - return i // Found an unbalanced closing bracket - } - } - } - - // If there are unmatched opening brackets - if openCount > 0 { - return firstOpenIndex - } - - return -1 // All brackets are balanced -} diff --git a/pkg/xurlfind3r/sources/intelx/intelx.go b/pkg/xurlfind3r/sources/intelx/intelx.go index f90a32a..929d623 100644 --- a/pkg/xurlfind3r/sources/intelx/intelx.go +++ b/pkg/xurlfind3r/sources/intelx/intelx.go @@ -1,13 +1,12 @@ -// Package intelx implements functions to search URLs from intelx. 
package intelx import ( "encoding/json" "fmt" - "net/mail" "strings" "time" + "github.com/hueristiq/hqgourl" "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/httpclient" "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/sources" "github.com/valyala/fasthttp" @@ -16,6 +15,7 @@ import ( type searchRequest struct { Term string `json:"term"` Timeout time.Duration `json:"timeout"` + Target int `json:"target"` MaxResults int `json:"maxresults"` Media int `json:"media"` } @@ -24,7 +24,7 @@ type searchResponse struct { Status int `json:"status"` } -type resultsResponse struct { +type getResultsResponse struct { Selectors []struct { Selectvalue string `json:"selectorvalue"` } `json:"selectors"` @@ -60,58 +60,67 @@ func (source *Source) Run(config *sources.Configuration, domain string) (URLsCha return } - searchURL := fmt.Sprintf("https://%s/phonebook/search?k=%s", intelXHost, intelXKey) + searchReqURL := fmt.Sprintf("https://%s/phonebook/search?k=%s", intelXHost, intelXKey) searchReqBody := searchRequest{ - Term: domain, + Term: "*" + domain, MaxResults: 100000, Media: 0, + Target: 3, // 1 = Domains | 2 = Emails | 3 = URLs Timeout: 20, } - var body []byte + var searchReqBodyBytes []byte - body, err = json.Marshal(searchReqBody) + searchReqBodyBytes, err = json.Marshal(searchReqBody) if err != nil { return } - var res *fasthttp.Response + var searchRes *fasthttp.Response - res, err = httpclient.SimplePost(searchURL, "application/json", body) + searchRes, err = httpclient.SimplePost(searchReqURL, "application/json", searchReqBodyBytes) if err != nil { return } - var searchResponseData searchResponse + var searchResData searchResponse - if err = json.Unmarshal(res.Body(), &searchResponseData); err != nil { + if err = json.Unmarshal(searchRes.Body(), &searchResData); err != nil { return } - resultsURL := fmt.Sprintf("https://%s/phonebook/search/result?k=%s&id=%s&limit=10000", intelXHost, intelXKey, searchResponseData.ID) + getResultsReqURL := fmt.Sprintf("https://%s/phonebook/search/result?k=%s&id=%s&limit=10000", intelXHost, intelXKey, searchResData.ID) status := 0 for status == 0 || status == 3 { - res, err = httpclient.Get(resultsURL, "", nil) + var getResultsRes *fasthttp.Response + + getResultsRes, err = httpclient.Get(getResultsReqURL, "", nil) if err != nil { return } - var resultsResponseData resultsResponse + var getResultsResData getResultsResponse - if err = json.Unmarshal(res.Body(), &resultsResponseData); err != nil { + if err = json.Unmarshal(getResultsRes.Body(), &getResultsResData); err != nil { return } - status = resultsResponseData.Status + status = getResultsResData.Status - for _, hostname := range resultsResponseData.Selectors { + for _, hostname := range getResultsResData.Selectors { URL := hostname.Selectvalue + URL = sources.FixURL(URL) - if isEmail(URL) { - continue + parsedURL, err := hqgourl.Parse(URL) + if err != nil { + return } + parsedURL.Path = strings.Split(parsedURL.Path, ":")[0] + + URL = parsedURL.String() + if !sources.IsInScope(URL, domain, config.IncludeSubdomains) { continue } @@ -124,13 +133,6 @@ func (source *Source) Run(config *sources.Configuration, domain string) (URLsCha return } -func isEmail(URL string) (isEmail bool) { - _, err := mail.ParseAddress(URL) - isEmail = err == nil - - return -} - func (source *Source) Name() string { return "intelx" } diff --git a/pkg/xurlfind3r/sources/utils.go b/pkg/xurlfind3r/sources/utils.go index 8ebb158..0a7d529 100644 --- a/pkg/xurlfind3r/sources/utils.go +++ b/pkg/xurlfind3r/sources/utils.go @@ -4,6 +4,7 @@ import ( 
"crypto/rand" "fmt" "math/big" + "strings" "github.com/hueristiq/hqgourl" ) @@ -58,3 +59,100 @@ func IsInScope(URL, domain string, includeSubdomains bool) (isInScope bool) { return } + +func FixURL(URL string) (fixedURL string) { + fixedURL = URL + + // ',",`, + quotes := []rune{'\'', '"', '`'} + + for i := range quotes { + quote := quotes[i] + + indexOfQuote := findUnbalancedQuote(URL, quote) + if indexOfQuote <= len(fixedURL) && indexOfQuote >= 0 { + fixedURL = fixedURL[:indexOfQuote] + } + } + + // (),[],{} + parentheses := []struct { + Opening, Closing rune + }{ + {'[', ']'}, + {'(', ')'}, + {'{', '}'}, + } + + for i := range parentheses { + parenthesis := parentheses[i] + + indexOfParenthesis := findUnbalancedBracket(URL, parenthesis.Opening, parenthesis.Closing) + if indexOfParenthesis <= len(fixedURL) && indexOfParenthesis >= 0 { + fixedURL = fixedURL[:indexOfParenthesis] + } + } + + // ; + indexOfComma := strings.Index(fixedURL, ";") + if indexOfComma <= len(fixedURL) && indexOfComma >= 0 { + fixedURL = fixedURL[:indexOfComma] + } + + return +} + +func findUnbalancedQuote(s string, quoteChar rune) int { + insideQuotes := false + + for _, ch := range s { + if ch == quoteChar { + if insideQuotes { + insideQuotes = false + } else { + insideQuotes = true + } + } + } + + // If still inside quotes at the end of the string, + // find the index of the opening quote + if insideQuotes { + for i, ch := range s { + if ch == quoteChar { + return i + } + } + } + + return -1 // return -1 if all quotes are balanced +} + +func findUnbalancedBracket(s string, openChar, closeChar rune) int { + openCount := 0 + + var firstOpenIndex int + + for i, ch := range s { + if ch == openChar { + if openCount == 0 { + firstOpenIndex = i + } + + openCount++ + } else if ch == closeChar { + openCount-- + + if openCount < 0 { + return i // Found an unbalanced closing bracket + } + } + } + + // If there are unmatched opening brackets + if openCount > 0 { + return firstOpenIndex + } + + return -1 // All brackets are balanced +} From aed5b10a131b9b4936be9d40a777c2cb2c53d46d Mon Sep 17 00:00:00 2001 From: "Alex Munene (@enenumxela)" <62714471+enenumxela@users.noreply.github.com> Date: Thu, 27 Jul 2023 00:36:01 +0300 Subject: [PATCH 21/24] refactor: Revise wayback workings --- pkg/xurlfind3r/sources/wayback/wayback.go | 56 ++++++++----------- .../sources/wayback/waybackrobots.go | 24 +++----- .../sources/wayback/waybacksource.go | 4 +- 3 files changed, 31 insertions(+), 53 deletions(-) diff --git a/pkg/xurlfind3r/sources/wayback/wayback.go b/pkg/xurlfind3r/sources/wayback/wayback.go index f98c779..3cda0b7 100644 --- a/pkg/xurlfind3r/sources/wayback/wayback.go +++ b/pkg/xurlfind3r/sources/wayback/wayback.go @@ -1,4 +1,3 @@ -// Package wayback implements functions to search URLs from wayback. package wayback import ( @@ -30,32 +29,25 @@ func (source *Source) Run(config *sources.Configuration, domain string) (URLsCha go func() { defer close(URLsChannel) - // Get wayback URLs waybackURLs := make(chan string) go func() { defer close(waybackURLs) - var ( - err error - results []string - ) - if config.IncludeSubdomains { domain = "*." 
+ domain } - results, err = getWaybackURLs(domain) + var err error + + var URLs []string + + URLs, err = getWaybackURLs(domain) if err != nil { return } - for index := range results { - URL := results[index] - if URL == "" { - continue - } - + for _, URL := range URLs { waybackURLs <- URL } }() @@ -63,7 +55,6 @@ func (source *Source) Run(config *sources.Configuration, domain string) (URLsCha mediaURLRegex := regexp.MustCompile(`(?i)\.(apng|bpm|png|bmp|gif|heif|ico|cur|jpg|jpeg|jfif|pjp|pjpeg|psd|raw|svg|tif|tiff|webp|xbm|3gp|aac|flac|mpg|mpeg|mp3|mp4|m4a|m4v|m4p|oga|ogg|ogv|mov|wav|webm|eot|woff|woff2|ttf|otf)(?:\?|#|$)`) robotsURLsRegex := regexp.MustCompile(`^(https?)://[^ "]+/robots.txt$`) - // Process wayback Snapshots wg := &sync.WaitGroup{} for URL := range waybackURLs { @@ -117,20 +108,18 @@ func (source *Source) Run(config *sources.Configuration, domain string) (URLsCha func getWaybackURLs(domain string) (URLs []string, err error) { URLs = []string{} - var ( - res *fasthttp.Response - ) - limiter.Wait() - reqURL := fmt.Sprintf("http://web.archive.org/cdx/search/cdx?url=%s/*&output=txt&fl=original&collapse=urlkey", domain) + getURLsReqURL := fmt.Sprintf("http://web.archive.org/cdx/search/cdx?url=%s/*&output=txt&fl=original&collapse=urlkey", domain) + + var getURLsRes *fasthttp.Response - res, err = httpclient.SimpleGet(reqURL) + getURLsRes, err = httpclient.SimpleGet(getURLsReqURL) if err != nil { return } - scanner := bufio.NewScanner(bytes.NewReader(res.Body())) + scanner := bufio.NewScanner(bytes.NewReader(getURLsRes.Body())) for scanner.Scan() { URL := scanner.Text() @@ -149,24 +138,22 @@ func getWaybackURLs(domain string) (URLs []string, err error) { } func getWaybackSnapshots(URL string) (snapshots [][2]string, err error) { - var ( - res *fasthttp.Response - ) - limiter.Wait() - reqURL := fmt.Sprintf("https://web.archive.org/cdx/search/cdx?url=%s&output=json&fl=timestamp,original&collapse=digest", URL) + getSnapshotsReqURL := fmt.Sprintf("https://web.archive.org/cdx/search/cdx?url=%s&output=json&fl=timestamp,original&collapse=digest", URL) + + var getSnapshotsRes *fasthttp.Response - res, err = httpclient.SimpleGet(reqURL) + getSnapshotsRes, err = httpclient.SimpleGet(getSnapshotsReqURL) if err != nil { return } - if res.Header.ContentLength() == 0 { + if getSnapshotsRes.Header.ContentLength() == 0 { return } - if err = json.Unmarshal(res.Body(), &snapshots); err != nil { + if err = json.Unmarshal(getSnapshotsRes.Body(), &snapshots); err != nil { return } @@ -183,19 +170,20 @@ func getWaybackContent(snapshot [2]string) (content string, err error) { var ( timestamp = snapshot[0] URL = snapshot[1] - res *fasthttp.Response ) limiter.Wait() - reqURL := fmt.Sprintf("https://web.archive.org/web/%sif_/%s", timestamp, URL) + getSnapshotContentReqURL := fmt.Sprintf("https://web.archive.org/web/%sif_/%s", timestamp, URL) + + var getSnapshotContentRes *fasthttp.Response - res, err = httpclient.SimpleGet(reqURL) + getSnapshotContentRes, err = httpclient.SimpleGet(getSnapshotContentReqURL) if err != nil { return } - content = string(res.Body()) + content = string(getSnapshotContentRes.Body()) if content == "" { return diff --git a/pkg/xurlfind3r/sources/wayback/waybackrobots.go b/pkg/xurlfind3r/sources/wayback/waybackrobots.go index 4ed89d4..3fa1229 100644 --- a/pkg/xurlfind3r/sources/wayback/waybackrobots.go +++ b/pkg/xurlfind3r/sources/wayback/waybackrobots.go @@ -13,23 +13,19 @@ import ( func parseWaybackRobots(_ *sources.Configuration, URL string) (robotsURLs chan string) { robotsURLs 
= make(chan string) - robotsEntryRegex := regexp.MustCompile(`Disallow:\s?.+`) + robotsEntryRegex := regexp.MustCompile(`(Allow|Disallow):\s?.+`) go func() { defer close(robotsURLs) - // retrieve snapshots snapshots, err := getWaybackSnapshots(URL) if err != nil { return } - // retrieve and parse snapshots' content for robotsURLs wg := &sync.WaitGroup{} - for index := range snapshots { - row := snapshots[index] - + for _, row := range snapshots { wg.Add(1) go func(row [2]string) { @@ -40,22 +36,22 @@ func parseWaybackRobots(_ *sources.Configuration, URL string) (robotsURLs chan s return } - disallowed := robotsEntryRegex.FindAllStringSubmatch(content, -1) + matches := robotsEntryRegex.FindAllStringSubmatch(content, -1) - if len(disallowed) < 1 { + if len(matches) < 1 { return } - for index := range disallowed { - entry := disallowed[index] + for _, match := range matches { + entry := match[0] - temp := strings.Split(entry[0], "Disallow:") + temp := strings.Split(entry, ": ") if len(temp) <= 1 { continue } - robotsURL := strings.Trim(temp[1], " ") + robotsURL := temp[1] if robotsURL == "/" || robotsURL == "*" || robotsURL == "" { continue @@ -66,16 +62,12 @@ func parseWaybackRobots(_ *sources.Configuration, URL string) (robotsURLs chan s for strings.HasPrefix(robotsURL, "/") { if len(robotsURL) >= 1 { robotsURL = robotsURL[1:] // Ex. /*/test or /*/*/demo - } else { - continue } } for strings.HasSuffix(robotsURL, "/") { if len(robotsURL) >= 1 { robotsURL = robotsURL[0 : len(robotsURL)-1] - } else { - continue } } diff --git a/pkg/xurlfind3r/sources/wayback/waybacksource.go b/pkg/xurlfind3r/sources/wayback/waybacksource.go index 329c097..a5173f5 100644 --- a/pkg/xurlfind3r/sources/wayback/waybacksource.go +++ b/pkg/xurlfind3r/sources/wayback/waybacksource.go @@ -36,9 +36,7 @@ func parseWaybackSource(domain, URL string) (sourceURLs chan string) { wg := &sync.WaitGroup{} - for index := range snapshots { - row := snapshots[index] - + for _, row := range snapshots { wg.Add(1) go func(row [2]string) { From 392f436e1ada29a27540582a5078363b017ec44f Mon Sep 17 00:00:00 2001 From: "Alex Munene (@enenumxela)" <62714471+enenumxela@users.noreply.github.com> Date: Tue, 1 Aug 2023 20:30:35 +0300 Subject: [PATCH 22/24] chore: - --- cmd/xurlfind3r/main.go | 185 +++++++----------- pkg/xurlfind3r/sources/utils.go | 10 + pkg/xurlfind3r/sources/wayback/wayback.go | 125 ++++++------ .../sources/wayback/waybackrobots.go | 4 +- .../sources/wayback/waybacksource.go | 86 ++++---- 5 files changed, 197 insertions(+), 213 deletions(-) diff --git a/cmd/xurlfind3r/main.go b/cmd/xurlfind3r/main.go index 9e2a6a1..b5fefe1 100644 --- a/cmd/xurlfind3r/main.go +++ b/cmd/xurlfind3r/main.go @@ -8,7 +8,6 @@ import ( "reflect" "strconv" "strings" - "sync" "github.com/hueristiq/hqgolog" "github.com/hueristiq/hqgolog/formatter" @@ -23,7 +22,7 @@ import ( var ( au aurora.Aurora - domainsSlice []string + domains []string domainsListFilePath string includeSubdomains bool listSources bool @@ -31,7 +30,6 @@ var ( sourcesToExclude []string parseWaybackRobots bool parseWaybackSource bool - threads int filterPattern string matchPattern string monochrome bool @@ -43,11 +41,10 @@ var ( func init() { // defaults - defaultThreads := 50 defaultYAMLConfigFile := fmt.Sprintf("~/.hueristiq/%s/config.yaml", configuration.NAME) // Handle CLI arguments, flags & help message (pflag) - pflag.StringSliceVarP(&domainsSlice, "domain", "d", []string{}, "") + pflag.StringSliceVarP(&domains, "domain", "d", []string{}, "") pflag.StringVarP(&domainsListFilePath, 
"list", "l", "", "") pflag.BoolVar(&includeSubdomains, "include-subdomains", false, "") pflag.BoolVar(&listSources, "sources", false, "") @@ -55,7 +52,6 @@ func init() { pflag.StringSliceVarP(&sourcesToExclude, "exclude-sources", "e", []string{}, "") pflag.BoolVar(&parseWaybackRobots, "parse-wayback-robots", false, "") pflag.BoolVar(&parseWaybackSource, "parse-wayback-source", false, "") - pflag.IntVarP(&threads, "threads", "t", defaultThreads, "") pflag.StringVarP(&filterPattern, "filter", "f", "", "") pflag.StringVarP(&matchPattern, "match", "m", "", "") pflag.BoolVar(&monochrome, "no-color", false, "") @@ -85,9 +81,6 @@ func init() { h += " --parse-wayback-robots bool with wayback, parse robots.txt snapshots\n" h += " --parse-wayback-source bool with wayback, parse source code snapshots\n" - h += "\nOPTIMIZATION:\n" - h += fmt.Sprintf(" -t, --threads int number of threads (default: %d)\n", defaultThreads) - h += "\nFILTER & MATCH:\n" h += " -f, --filter string regex to filter URLs\n" h += " -m, --match string regex to match URLs\n" @@ -135,8 +128,12 @@ func main() { fmt.Fprintln(os.Stderr, configuration.BANNER) } + var err error + + var config configuration.Configuration + // Read in configuration. - config, err := configuration.Read(YAMLConfigFile) + config, err = configuration.Read(YAMLConfigFile) if err != nil { hqgolog.Fatal().Msg(err.Error()) } @@ -168,85 +165,53 @@ func main() { } // Load input domains. - domains := make(chan string, threads) - - go func() { - defer close(domains) - wg := &sync.WaitGroup{} + // input domains: file + if domainsListFilePath != "" { + var file *os.File - // input domains: slice - if len(domainsSlice) > 0 { - wg.Add(1) - - go func() { - defer wg.Done() + file, err = os.Open(domainsListFilePath) + if err != nil { + hqgolog.Error().Msg(err.Error()) - for _, domain := range domainsSlice { - domains <- domain - } - }() + return } - // input domains: file - if domainsListFilePath != "" { - wg.Add(1) - - go func() { - defer wg.Done() - - file, err := os.Open(domainsListFilePath) - if err != nil { - hqgolog.Error().Msg(err.Error()) + scanner := bufio.NewScanner(file) - return - } + for scanner.Scan() { + domain := scanner.Text() - scanner := bufio.NewScanner(file) - - for scanner.Scan() { - domain := scanner.Text() - - if domain != "" { - domains <- domain - } - } - - if err := scanner.Err(); err != nil { - hqgolog.Error().Msg(err.Error()) - - return - } - }() + if domain != "" { + domains = append(domains, domain) + } } - // input domains: stdin - if hasStdin() { - wg.Add(1) + if err = scanner.Err(); err != nil { + hqgolog.Error().Msg(err.Error()) - go func() { - defer wg.Done() + return + } + } - scanner := bufio.NewScanner(os.Stdin) + // input domains: stdin + if hasStdin() { + scanner := bufio.NewScanner(os.Stdin) - for scanner.Scan() { - domain := scanner.Text() + for scanner.Scan() { + domain := scanner.Text() - if domain != "" { - domains <- domain - } - } + if domain != "" { + domains = append(domains, domain) + } + } - if err := scanner.Err(); err != nil { - hqgolog.Error().Msg(err.Error()) + if err = scanner.Err(); err != nil { + hqgolog.Error().Msg(err.Error()) - return - } - }() + return } - - wg.Wait() - }() + } // Find and output URLs. 
var consolidatedWriter *bufio.Writer

@@ -256,7 +221,9 @@ func main() {

 		mkdir(directory)

-		consolidatedFile, err := os.OpenFile(output, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
+		var consolidatedFile *os.File
+
+		consolidatedFile, err = os.OpenFile(output, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
 		if err != nil {
 			hqgolog.Fatal().Msg(err.Error())
 		}
@@ -270,57 +237,49 @@ func main() {
 		mkdir(outputDirectory)
 	}

-	wg := &sync.WaitGroup{}
+	options := &xurlfind3r.Options{
+		IncludeSubdomains:  includeSubdomains,
+		SourcesToUSe:       sourcesToUse,
+		SourcesToExclude:   sourcesToExclude,
+		Keys:               config.Keys,
+		ParseWaybackRobots: parseWaybackRobots,
+		ParseWaybackSource: parseWaybackSource,
+		FilterPattern:      filterPattern,
+		Matchattern:        matchPattern,
+	}

-	for i := 0; i < threads; i++ {
-		wg.Add(1)
+	var finder *xurlfind3r.Finder

-		go func() {
-			defer wg.Done()
+	finder, err = xurlfind3r.New(options)
+	if err != nil {
+		hqgolog.Error().Msg(err.Error())

-			options := &xurlfind3r.Options{
-				IncludeSubdomains:  includeSubdomains,
-				SourcesToUSe:       sourcesToUse,
-				SourcesToExclude:   sourcesToExclude,
-				Keys:               config.Keys,
-				ParseWaybackRobots: parseWaybackRobots,
-				ParseWaybackSource: parseWaybackSource,
-				FilterPattern:      filterPattern,
-				Matchattern:        matchPattern,
-			}
+		return
+	}

-			finder, err := xurlfind3r.New(options)
+	for _, domain := range domains {
+		URLs := finder.Find(domain)
+
+		switch {
+		case output != "":
+			outputURLs(consolidatedWriter, URLs, verbosity)
+		case outputDirectory != "":
+			var domainFile *os.File
+
+			domainFile, err = os.OpenFile(filepath.Join(outputDirectory, domain+".txt"), os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
 			if err != nil {
 				hqgolog.Error().Msg(err.Error())

 				return
 			}

-			for domain := range domains {
-				URLs := finder.Find(domain)
-
-				switch {
-				case output != "":
-					outputURLs(consolidatedWriter, URLs, verbosity)
-				case outputDirectory != "":
-					domainFile, err := os.OpenFile(filepath.Join(outputDirectory, domain+".txt"), os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
-					if err != nil {
-						hqgolog.Error().Msg(err.Error())
-
-						return
-					}
+			domainWriter := bufio.NewWriter(domainFile)

-					domainWriter := bufio.NewWriter(domainFile)
-
-					outputURLs(domainWriter, URLs, verbosity)
-				default:
-					outputURLs(nil, URLs, verbosity)
-				}
-			}
-		}()
+			outputURLs(domainWriter, URLs, verbosity)
+		default:
+			outputURLs(nil, URLs, verbosity)
+		}
 	}
-
-	wg.Wait()
 }

 func hasStdin() bool {
diff --git a/pkg/xurlfind3r/sources/utils.go b/pkg/xurlfind3r/sources/utils.go
index 0a7d529..bda4c5d 100644
--- a/pkg/xurlfind3r/sources/utils.go
+++ b/pkg/xurlfind3r/sources/utils.go
@@ -4,6 +4,7 @@ import (
 	"crypto/rand"
 	"fmt"
 	"math/big"
+	"net/url"
 	"strings"

 	"github.com/hueristiq/hqgourl"
@@ -63,6 +64,15 @@ func IsInScope(URL, domain string, includeSubdomains bool) (isInScope bool) {
 func FixURL(URL string) (fixedURL string) {
 	fixedURL = URL

+	// remove beginning and ending quotes
+	fixedURL = strings.Trim(fixedURL, "\"")
+	fixedURL = strings.Trim(fixedURL, "'")
+
+	fixedURL, _ = url.QueryUnescape(fixedURL)
+
+	// remove beginning and ending spaces
+	fixedURL = strings.Trim(fixedURL, " ")
+
 	// ',",`,
 	quotes := []rune{'\'', '"', '`'}
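For illustration only (this block is not part of the patches): the `FixURL` additions above trim wrapping quotes, percent-decode the value with `url.QueryUnescape`, then trim stray spaces. A minimal standalone sketch of just those steps — the `normalize` name, the `main` wrapper, and the sample inputs are illustrative assumptions; the real helper goes on to apply its existing quote-rune handling afterwards:

```go
package main

import (
	"fmt"
	"net/url"
	"strings"
)

// normalize mirrors only the new pre-processing steps added to FixURL.
func normalize(URL string) string {
	// remove beginning and ending quotes
	URL = strings.Trim(URL, "\"")
	URL = strings.Trim(URL, "'")

	// decode percent-encoding, e.g. https%3A%2F%2F -> https://
	URL, _ = url.QueryUnescape(URL)

	// remove beginning and ending spaces
	URL = strings.Trim(URL, " ")

	return URL
}

func main() {
	samples := []string{
		`"https://example.com/page"`,
		`https%3A%2F%2Fexample.com%2Fpath`,
		`'https://example.com/x'`,
	}

	for _, sample := range samples {
		fmt.Println(normalize(sample)) // prints the cleaned URL for each sample
	}
}
```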
diff --git a/pkg/xurlfind3r/sources/wayback/wayback.go b/pkg/xurlfind3r/sources/wayback/wayback.go
index 3cda0b7..91c0f6c 100644
--- a/pkg/xurlfind3r/sources/wayback/wayback.go
+++ b/pkg/xurlfind3r/sources/wayback/wayback.go
@@ -1,8 +1,6 @@
 package wayback

 import (
-	"bufio"
-	"bytes"
 	"encoding/json"
 	"fmt"
 	"regexp"
@@ -29,74 +27,97 @@ func (source *Source) Run(config *sources.Configuration, domain string) (URLsCha
 	go func() {
 		defer close(URLsChannel)

-		waybackURLs := make(chan string)
+		var err error

-		go func() {
-			defer close(waybackURLs)
+		getPagesReqURL := formatURL(domain, config.IncludeSubdomains) + "&showNumPages=true"

-			if config.IncludeSubdomains {
-				domain = "*." + domain
-			}
+		limiter.Wait()
+
+		var getPagesRes *fasthttp.Response
+
+		getPagesRes, err = httpclient.SimpleGet(getPagesReqURL)
+		if err != nil {
+			return
+		}
+
+		var pages uint
+
+		if err = json.Unmarshal(getPagesRes.Body(), &pages); err != nil {
+			return
+		}
+
+		waybackURLs := [][]string{}
+
+		for page := uint(0); page < pages; page++ {
+			getURLsReqURL := fmt.Sprintf("%s&page=%d", formatURL(domain, config.IncludeSubdomains), page)

-			var err error
+			limiter.Wait()

-			var URLs []string
+			var getURLsRes *fasthttp.Response

-			URLs, err = getWaybackURLs(domain)
+			getURLsRes, err = httpclient.SimpleGet(getURLsReqURL)
 			if err != nil {
 				return
 			}

-			for _, URL := range URLs {
-				waybackURLs <- URL
+			var getURLsResData [][]string
+
+			if err = json.Unmarshal(getURLsRes.Body(), &getURLsResData); err != nil {
+				return
+			}
+
+			// check if there are results; Wayback's pagination
+			// response is not always correct when using a filter
+			if len(getURLsResData) == 0 {
+				break
 			}
-		}()

-		mediaURLRegex := regexp.MustCompile(`(?i)\.(apng|bpm|png|bmp|gif|heif|ico|cur|jpg|jpeg|jfif|pjp|pjpeg|psd|raw|svg|tif|tiff|webp|xbm|3gp|aac|flac|mpg|mpeg|mp3|mp4|m4a|m4v|m4p|oga|ogg|ogv|mov|wav|webm|eot|woff|woff2|ttf|otf)(?:\?|#|$)`)
+			// collect results
+			// slice as [1:] to skip the first row, which holds the CDX field names
+			waybackURLs = append(waybackURLs, getURLsResData[1:]...)
+		}
+
+		mediaURLRegex := regexp.MustCompile(`(?i)\.(apng|bpm|png|bmp|gif|heif|ico|cur|jpg|jpeg|jfif|pjp|pjpeg|psd|raw|svg|tif|tiff|webp|xbm|3gp|aac|flac|mpg|mpeg|mp3|mp4|m4a|m4v|m4p|oga|ogg|ogv|mov|wav|webm|eot|woff|woff2|ttf|otf|pdf)(?:\?|#|$)`)
 		robotsURLsRegex := regexp.MustCompile(`^(https?)://[^ "]+/robots.txt$`)

 		wg := &sync.WaitGroup{}

-		for URL := range waybackURLs {
+		for _, waybackURL := range waybackURLs {
 			wg.Add(1)

-			go func(URL string) {
+			go func(waybackURL []string) {
 				defer wg.Done()

+				URL := waybackURL[1] // fl=timestamp,original,... so index 1 is the original URL
+
 				if !sources.IsInScope(URL, domain, config.IncludeSubdomains) {
 					return
 				}

 				URLsChannel <- sources.URL{Source: source.Name(), Value: URL}

-				if !config.ParseWaybackRobots && !config.ParseWaybackSource {
-					return
-				}
-
 				if mediaURLRegex.MatchString(URL) {
 					return
 				}

-				if config.ParseWaybackRobots &&
-					robotsURLsRegex.MatchString(URL) {
+				if config.ParseWaybackRobots && robotsURLsRegex.MatchString(URL) {
 					for robotsURL := range parseWaybackRobots(config, URL) {
-						if !sources.IsInScope(URL, domain, config.IncludeSubdomains) {
-							continue
-						}
-
 						URLsChannel <- sources.URL{Source: source.Name() + ":robots", Value: robotsURL}
 					}
-				} else if config.ParseWaybackSource &&
-					!robotsURLsRegex.MatchString(URL) {
+
+					return
+				}
+
+				if config.ParseWaybackSource {
 					for sourceURL := range parseWaybackSource(domain, URL) {
-						if !sources.IsInScope(URL, domain, config.IncludeSubdomains) {
+						if !sources.IsInScope(sourceURL, domain, config.IncludeSubdomains) {
 							continue
 						}

 						URLsChannel <- sources.URL{Source: source.Name() + ":source", Value: sourceURL}
 					}
 				}
-			}(URL)
+			}(waybackURL)
 		}

 		wg.Wait()
@@ -105,45 +126,23 @@ func (source *Source) Run(config *sources.Configuration, domain string) (URLsCha
 	return
 }

-func getWaybackURLs(domain string) (URLs []string, err error) {
-	URLs = []string{}
-
-	limiter.Wait()
-
-	getURLsReqURL :=
fmt.Sprintf("http://web.archive.org/cdx/search/cdx?url=%s/*&output=txt&fl=original&collapse=urlkey", domain) - - var getURLsRes *fasthttp.Response - - getURLsRes, err = httpclient.SimpleGet(getURLsReqURL) - if err != nil { - return - } - - scanner := bufio.NewScanner(bytes.NewReader(getURLsRes.Body())) - - for scanner.Scan() { - URL := scanner.Text() - if URL == "" { - continue - } - - URLs = append(URLs, URL) +func formatURL(domain string, includeSubdomains bool) (URL string) { + if includeSubdomains { + domain = "*." + domain } - if err = scanner.Err(); err != nil { - return - } + URL = fmt.Sprintf("http://web.archive.org/cdx/search/cdx?url=%s/*&output=json&collapse=urlkey&fl=timestamp,original,mimetype,statuscode,digest", domain) return } -func getWaybackSnapshots(URL string) (snapshots [][2]string, err error) { - limiter.Wait() - +func getSnapshots(URL string) (snapshots [][2]string, err error) { getSnapshotsReqURL := fmt.Sprintf("https://web.archive.org/cdx/search/cdx?url=%s&output=json&fl=timestamp,original&collapse=digest", URL) var getSnapshotsRes *fasthttp.Response + limiter.Wait() + getSnapshotsRes, err = httpclient.SimpleGet(getSnapshotsReqURL) if err != nil { return @@ -166,16 +165,16 @@ func getWaybackSnapshots(URL string) (snapshots [][2]string, err error) { return } -func getWaybackContent(snapshot [2]string) (content string, err error) { +func getSnapshotContent(snapshot [2]string) (content string, err error) { var ( timestamp = snapshot[0] URL = snapshot[1] ) - limiter.Wait() - getSnapshotContentReqURL := fmt.Sprintf("https://web.archive.org/web/%sif_/%s", timestamp, URL) + limiter.Wait() + var getSnapshotContentRes *fasthttp.Response getSnapshotContentRes, err = httpclient.SimpleGet(getSnapshotContentReqURL) diff --git a/pkg/xurlfind3r/sources/wayback/waybackrobots.go b/pkg/xurlfind3r/sources/wayback/waybackrobots.go index 3fa1229..74d294b 100644 --- a/pkg/xurlfind3r/sources/wayback/waybackrobots.go +++ b/pkg/xurlfind3r/sources/wayback/waybackrobots.go @@ -18,7 +18,7 @@ func parseWaybackRobots(_ *sources.Configuration, URL string) (robotsURLs chan s go func() { defer close(robotsURLs) - snapshots, err := getWaybackSnapshots(URL) + snapshots, err := getSnapshots(URL) if err != nil { return } @@ -31,7 +31,7 @@ func parseWaybackRobots(_ *sources.Configuration, URL string) (robotsURLs chan s go func(row [2]string) { defer wg.Done() - content, err := getWaybackContent(row) + content, err := getSnapshotContent(row) if err != nil { return } diff --git a/pkg/xurlfind3r/sources/wayback/waybacksource.go b/pkg/xurlfind3r/sources/wayback/waybacksource.go index a5173f5..023df48 100644 --- a/pkg/xurlfind3r/sources/wayback/waybacksource.go +++ b/pkg/xurlfind3r/sources/wayback/waybacksource.go @@ -8,6 +8,7 @@ import ( "sync" "github.com/hueristiq/hqgourl" + "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/sources" ) func parseWaybackSource(domain, URL string) (sourceURLs chan string) { @@ -20,7 +21,7 @@ func parseWaybackSource(domain, URL string) (sourceURLs chan string) { var snapshots [][2]string - snapshots, err = getWaybackSnapshots(URL) + snapshots, err = getSnapshots(URL) if err != nil { return } @@ -34,6 +35,9 @@ func parseWaybackSource(domain, URL string) (sourceURLs chan string) { return } + regex1 := regexp.MustCompile(`^(//web\.archive\.org/web|https://web\.archive\.org/web|/web)/\d{14}([a-z]{2}_)?/.*`) + regex2 := regexp.MustCompile(`^https?://.*`) + wg := &sync.WaitGroup{} for _, row := range snapshots { @@ -42,62 +46,74 @@ func parseWaybackSource(domain, URL string) 
(sourceURLs chan string) {
 go func(row [2]string) {
 			defer wg.Done()

-			content, err := getWaybackContent(row)
+			content, err := getSnapshotContent(row)
 			if err != nil {
 				return
 			}

-			links := lxExtractor.FindAllString(content, -1)
+			lxURLs := lxExtractor.FindAllString(content, -1)

-			for index := range links {
-				sourceURL := links[index]
+			for _, lxURL := range lxURLs {
+				lxURL = sources.FixURL(lxURL)

-				// remove beginning and ending quotes
-				sourceURL = strings.Trim(sourceURL, "\"")
-				sourceURL = strings.Trim(sourceURL, "'")
+				// `/web/20230128054726/https://example.com/`
+				// `//web.archive.org/web/20230128054726/https://example.com/`
+				// `https://web.archive.org/web/20230128054726/https://example.com/`
+				// `/web/20040111155853js_/http://example.com/2003/mm_menu.js`
+				if regex1.MatchString(lxURL) {
+					URLs := mdExtractor.FindAllString(lxURL, -1)

-				// remove beginning and ending spaces
-				sourceURL = strings.Trim(sourceURL, " ")
+					for _, URL := range URLs {
+						// skip non-http results, e.g. `https://web.archive.org/web/20001110042700/mailto:info@safaricom.co.ke`
+						if !strings.HasPrefix(URL, "http") {
+							continue
+						}

-				// if URL starts with `//web.archive.org/web` append scheme i.e to process it as an absolute URL
-				if strings.HasPrefix(sourceURL, "//web.archive.org/web") {
-					sourceURL = "https:" + sourceURL
-				}
+						sourceURLs <- URL
+					}

-				parsedSourceURL, err := hqgourl.Parse(sourceURL)
-				if err != nil {
 					continue
 				}

-				if parsedSourceURL.IsAbs() {
-					URLs := mdExtractor.FindAllString(sourceURL, -1)
+				// `http://www.safaricom.co.ke/`
+				// `https://web.archive.org/web/*/http://www.safaricom.co.ke/*`
+				// `//html5shim.googlecode.com/svn/trunk/html5.js`
+				if regex2.MatchString(lxURL) || strings.HasPrefix(lxURL, `//`) {
+					URLs := mdExtractor.FindAllString(lxURL, -1)

 					for _, URL := range URLs {
 						sourceURLs <- URL
 					}
-				} else {
-					_, _, err := mime.ParseMediaType(sourceURL)
-					if err == nil {
-						continue
-					}

-					URLs := mdExtractor.FindAllString(sourceURL, -1)
+					continue
+				}

-					for _, URL := range URLs {
-						sourceURLs <- URL
-					}
+				// skip values that are bare MIME types, e.g. text/javascript
+				_, _, err := mime.ParseMediaType(lxURL)
+				if err == nil {
+					continue
+				}

-					if len(URLs) > 0 {
-						continue
-					}
+				// `//archive.org/includes/analytics.js?v=c535ca67`
+				// `archive.org/components/npm/lit/polyfill-support.js?v=c535ca67`
+				// `archive.org/components/npm/@webcomponents/webcomponentsjs/webcomponents-bundle.js?v=c535ca67`
+				// `archive.org/includes/build/js/ia-topnav.min.js?v=c535ca67`
+				// `archive.org/includes/build/js/archive.min.js?v=c535ca67`
+				// `archive.org/includes/build/css/archive.min.css?v=c535ca67`
+				if strings.Contains(lxURL, "archive.org") {
+					continue
+				}

-					// remove beginning slash
-					sourceURL = strings.TrimLeft(sourceURL, "/")
+				parsedSourceURL, err := hqgourl.Parse(URL) // resolve relative paths against the snapshot page URL
+				if err != nil {
+					continue
+				}

-					sourceURL = fmt.Sprintf("%s://%s/%s", parsedSourceURL.Scheme, parsedSourceURL.Domain, sourceURL)
+				lxURL = strings.TrimLeft(lxURL, "/")

-					sourceURLs <- sourceURL
-				}
+				lxURL = fmt.Sprintf("%s://%s/%s", parsedSourceURL.Scheme, parsedSourceURL.Domain, lxURL)
+
+				sourceURLs <- lxURL
 			}
 		}(row)
 	}
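For illustration only (this block is not part of the patches): the reworked wayback source above first asks the CDX server how many pages a query spans via `showNumPages=true`, then fetches each page as JSON, where the first row of every page is the field-name header and is skipped. A minimal standalone sketch of that request pattern — the `example.com` target and the plain `net/http` calls are illustrative assumptions, not the package's `httpclient`/`limiter` plumbing:

```go
package main

import (
	"encoding/json"
	"fmt"
	"io"
	"net/http"
)

func main() {
	base := "https://web.archive.org/cdx/search/cdx?url=example.com/*&output=json&collapse=urlkey&fl=timestamp,original"

	// 1. ask how many pages the query spans
	res, err := http.Get(base + "&showNumPages=true")
	if err != nil {
		panic(err)
	}

	body, _ := io.ReadAll(res.Body)
	res.Body.Close()

	var pages uint

	if err := json.Unmarshal(body, &pages); err != nil {
		panic(err)
	}

	// 2. fetch each page; row 0 is the field-name header, so skip it
	for page := uint(0); page < pages; page++ {
		res, err := http.Get(fmt.Sprintf("%s&page=%d", base, page))
		if err != nil {
			break
		}

		body, _ := io.ReadAll(res.Body)
		res.Body.Close()

		var rows [][]string

		if err := json.Unmarshal(body, &rows); err != nil || len(rows) == 0 {
			break
		}

		for _, row := range rows[1:] {
			fmt.Println(row[1]) // row = [timestamp, original URL]
		}
	}
}
```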
From dfe77e7bd4d265e9404369797bc7f8ec89938ec0 Mon Sep 17 00:00:00 2001
From: "Alex Munene (@enenumxela)" <62714471+enenumxela@users.noreply.github.com>
Date: Tue, 1 Aug 2023 20:32:25 +0300
Subject: [PATCH 23/24] docs: -

---
 README.md | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/README.md b/README.md
index 466da65..45ce22c 100644
--- a/README.md
+++ b/README.md
@@ -167,9 +167,6 @@ SOURCES:
     --parse-wayback-robots bool  with wayback, parse robots.txt snapshots
     --parse-wayback-source bool  with wayback, parse source code snapshots

-OPTIMIZATION:
- -t, --threads int                number of threads (default: 50)
-
 FILTER & MATCH:
  -f, --filter string              regex to filter URLs
  -m, --match string               regex to match URLs

From ae777a271b50710d37cd3b3ecaee3e6a8ab482a7 Mon Sep 17 00:00:00 2001
From: "Alex Munene (@enenumxela)" <62714471+enenumxela@users.noreply.github.com>
Date: Tue, 1 Aug 2023 20:33:23 +0300
Subject: [PATCH 24/24] chore: Bump up version to 0.3.0

---
 README.md                               | 4 ++--
 internal/configuration/configuration.go | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 45ce22c..9c756e9 100644
--- a/README.md
+++ b/README.md
@@ -112,7 +112,7 @@ go install -v github.com/hueristiq/xurlfind3r/cmd/xurlfind3r@latest
 Example `config.yaml`:

 ```yaml
-version: 0.2.0
+version: 0.3.0
 sources:
     - bevigil
     - commoncrawl
@@ -148,7 +148,7 @@ help message:
 __  ___   _ _ __| |/ _(_)_ __   __| |___ / _ __
 \ \/ / | | | '__| | |_| | '_ \ / _` | |_ \| '__|
  >  <| |_| | |  | |  _| | | | | (_| |___) | |
-/_/\_\\__,_|_|  |_|_| |_|_| |_|\__,_|____/|_| v0.2.0
+/_/\_\\__,_|_|  |_|_| |_|_| |_|\__,_|____/|_| v0.3.0

 USAGE:
   xurlfind3r [OPTIONS]

diff --git a/internal/configuration/configuration.go b/internal/configuration/configuration.go
index c382947..6d31b46 100644
--- a/internal/configuration/configuration.go
+++ b/internal/configuration/configuration.go
@@ -48,7 +48,7 @@ func (configuration *Configuration) Write(path string) (err error) {

 const (
 	NAME    string = "xurlfind3r"
-	VERSION string = "0.2.0"
+	VERSION string = "0.3.0"
 )

 var (