Skip to content

Commit

Permalink
实现爬虫检测中间件 (#22)
Browse files Browse the repository at this point in the history
* feat(crawlerdetect): 实现爬虫检测中间件

* feat(crawlerdetect):
- 添加 license
- 测试代码优化

* feat(crawlerdetect): 修复 license
  • Loading branch information
chenmingyong0423 authored Apr 6, 2024
1 parent b236eb6 commit 1226b60
Show file tree
Hide file tree
Showing 13 changed files with 881 additions and 3 deletions.
25 changes: 25 additions & 0 deletions internal/crawlerdetect/baidu_strategy.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// Copyright 2023 ecodeclub
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package crawlerdetect

// BaiduStrategy is the crawler-detection strategy for Baidu.
// It embeds *UniversalStrategy, so the embedded type's methods are promoted
// onto BaiduStrategy.
type BaiduStrategy struct {
*UniversalStrategy
}

// NewBaiduStrategy builds a BaiduStrategy whose embedded UniversalStrategy is
// configured with the host names "baidu.com" and "baidu.jp".
func NewBaiduStrategy() *BaiduStrategy {
	hosts := []string{"baidu.com", "baidu.jp"}
	strategy := &BaiduStrategy{}
	strategy.UniversalStrategy = NewUniversalStrategy(hosts)
	return strategy
}
67 changes: 67 additions & 0 deletions internal/crawlerdetect/baidu_strategy_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// Copyright 2023 ecodeclub
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package crawlerdetect

import (
"errors"
"log"
"net"
"testing"

"github.com/stretchr/testify/require"
)

// TestBaiduStrategy runs CheckCrawler of the Baidu strategy against a table of
// literal IP addresses and checks both the match result and the returned error.
//
// NOTE(review): the cases use real addresses, so this test presumably needs
// working DNS resolution in the environment where it runs — confirm for CI.
func TestBaiduStrategy(t *testing.T) {
	s := NewBaiduStrategy()
	require.NotNil(t, s)
	testCases := []struct {
		name    string
		ip      string
		matched bool
		// errFunc validates the error returned by CheckCrawler.
		// A nil errFunc means the case expects no error at all.
		errFunc require.ErrorAssertionFunc
	}{
		{
			name:    "无效 ip",
			ip:      "256.0.0.0",
			matched: false,
			errFunc: func(t require.TestingT, err error, i ...interface{}) {
				var dnsError *net.DNSError
				if !errors.As(err, &dnsError) {
					// Fail only this test. log.Fatal would call os.Exit and
					// abort the entire test binary, hiding every other result.
					log.Printf("expected a *net.DNSError, got %T: %v", err, err)
					t.FailNow()
				}
			},
		},
		{
			name:    "非百度 ip",
			ip:      "166.249.90.77",
			matched: false,
		},
		{
			name:    "百度 ip",
			ip:      "111.206.198.69",
			matched: true,
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			m, err := s.CheckCrawler(tc.ip)
			// Always assert on the error: an expected error that never shows up
			// must fail the case, and an unexpected error must not reach a nil
			// errFunc (which would panic).
			if tc.errFunc != nil {
				tc.errFunc(t, err)
			} else {
				require.NoError(t, err)
			}
			require.Equal(t, tc.matched, m)
		})
	}
}
25 changes: 25 additions & 0 deletions internal/crawlerdetect/bing_strategy.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// Copyright 2023 ecodeclub
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package crawlerdetect

// BingStrategy is the crawler-detection strategy for Bing.
// It embeds *UniversalStrategy, so the embedded type's methods are promoted
// onto BingStrategy.
type BingStrategy struct {
*UniversalStrategy
}

// NewBingStrategy builds a BingStrategy whose embedded UniversalStrategy is
// configured with the single host name "search.msn.com".
func NewBingStrategy() *BingStrategy {
	hosts := []string{"search.msn.com"}
	strategy := &BingStrategy{}
	strategy.UniversalStrategy = NewUniversalStrategy(hosts)
	return strategy
}
67 changes: 67 additions & 0 deletions internal/crawlerdetect/bing_strategy_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// Copyright 2023 ecodeclub
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package crawlerdetect

import (
"errors"
"log"
"net"
"testing"

"github.com/stretchr/testify/require"
)

// TestBingStrategy runs CheckCrawler of the Bing strategy against a table of
// literal IP addresses and checks both the match result and the returned error.
//
// NOTE(review): the cases use real addresses, so this test presumably needs
// working DNS resolution in the environment where it runs — confirm for CI.
func TestBingStrategy(t *testing.T) {
	s := NewBingStrategy()
	require.NotNil(t, s)
	testCases := []struct {
		name    string
		ip      string
		matched bool
		// errFunc validates the error returned by CheckCrawler.
		// A nil errFunc means the case expects no error at all.
		errFunc require.ErrorAssertionFunc
	}{
		{
			name:    "无效 ip",
			ip:      "256.0.0.0",
			matched: false,
			errFunc: func(t require.TestingT, err error, i ...interface{}) {
				var dnsError *net.DNSError
				if !errors.As(err, &dnsError) {
					// Fail only this test. log.Fatal would call os.Exit and
					// abort the entire test binary, hiding every other result.
					log.Printf("expected a *net.DNSError, got %T: %v", err, err)
					t.FailNow()
				}
			},
		},
		{
			name:    "非必应 ip",
			ip:      "166.249.90.77",
			matched: false,
		},
		{
			name:    "必应 ip",
			ip:      "157.55.39.1",
			matched: true,
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			m, err := s.CheckCrawler(tc.ip)
			// Always assert on the error: an expected error that never shows up
			// must fail the case, and an unexpected error must not reach a nil
			// errFunc (which would panic).
			if tc.errFunc != nil {
				tc.errFunc(t, err)
			} else {
				require.NoError(t, err)
			}
			require.Equal(t, tc.matched, m)
		})
	}
}
93 changes: 93 additions & 0 deletions internal/crawlerdetect/crawler_detector.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
// Copyright 2023 ecodeclub
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package crawlerdetect

import (
"net"
"slices"
"strings"
)

// Names of the supported crawlers. They are the keys of strategyMap and
// therefore the values NewCrawlerDetector can resolve to a Strategy.
const (
Baidu = "baidu"
Bing = "bing"
Google = "google"
Sogou = "sogou"
)

// strategyMap associates each crawler name with its Strategy instance.
// The instances are created once, when the package-level variable is
// initialized, and are shared by every NewCrawlerDetector call.
var strategyMap = map[string]Strategy{
Baidu: NewBaiduStrategy(),
Bing: NewBingStrategy(),
Google: NewGoogleStrategy(),
Sogou: NewSoGouStrategy(),
}

// Strategy decides whether an IP address belongs to a particular crawler.
type Strategy interface {
// CheckCrawler reports whether ip belongs to the crawler this strategy
// represents. A non-nil error means the check could not be completed.
CheckCrawler(ip string) (bool, error)
}

// UniversalStrategy detects a crawler by forward-confirmed reverse DNS: the
// client IP must reverse-resolve to a name under one of Hosts, and that name
// must resolve back to the same IP.
type UniversalStrategy struct {
	// Hosts lists the domains the crawler's reverse-DNS names must belong to,
	// e.g. "googlebot.com". A name belongs to a domain when it equals the
	// domain or is a subdomain of it.
	Hosts []string
}

// NewUniversalStrategy returns a UniversalStrategy that accepts reverse-DNS
// names belonging to any of hosts. The slice is stored as given, not copied.
func NewUniversalStrategy(hosts []string) *UniversalStrategy {
	return &UniversalStrategy{
		Hosts: hosts,
	}
}

// CheckCrawler reports whether ip belongs to the crawler described by s.Hosts.
//
// It performs a reverse lookup of ip, picks the first returned name that
// belongs to one of s.Hosts, resolves that name forward and checks that ip is
// among the resulting addresses. It returns (false, nil) when no name matches
// or the forward lookup does not contain ip, and (false, err) when either DNS
// lookup fails.
func (s *UniversalStrategy) CheckCrawler(ip string) (bool, error) {
	names, err := net.LookupAddr(ip)
	if err != nil {
		return false, err
	}

	name, matched := s.matchHost(names)
	if !matched {
		return false, nil
	}

	ips, err := net.LookupIP(name)
	if err != nil {
		return false, err
	}

	// Compare parsed addresses rather than text so that equivalent spellings
	// (upper-case or non-compressed IPv6, IPv4-mapped IPv6) still match.
	want := net.ParseIP(ip)
	found := slices.ContainsFunc(ips, func(got net.IP) bool {
		if want == nil {
			// ip is not in a form ParseIP understands; keep the textual check.
			return got.String() == ip
		}
		return got.Equal(want)
	})
	return found, nil
}

// matchHost returns the first entry of names that belongs to one of s.Hosts,
// trying the hosts in order. The name is returned exactly as received so it
// can be passed to a forward lookup. It returns ("", false) when nothing
// matches.
//
// Matching is case-insensitive and anchored on a label boundary: the name must
// equal the host or end in "."+host. A substring test would accept
// attacker-controlled PTR names such as "baidu.com.evil.example." or
// "notbaidu.com.", which also pass forward confirmation.
func (s *UniversalStrategy) matchHost(names []string) (string, bool) {
	for _, host := range s.Hosts {
		domain := strings.ToLower(strings.Trim(host, "."))
		if domain == "" {
			// An empty host would authorize every name; ignore it.
			continue
		}
		for _, name := range names {
			// Reverse lookups return fully qualified names with a trailing dot.
			fqdn := strings.ToLower(strings.TrimSuffix(name, "."))
			if fqdn == domain || strings.HasSuffix(fqdn, "."+domain) {
				return name, true
			}
		}
	}
	return "", false
}

// NewCrawlerDetector returns the Strategy registered in strategyMap under the
// given crawler name. It returns nil when no strategy is registered for that
// name, so callers must check the result before using it.
func NewCrawlerDetector(crawler string) Strategy {
	strategy, registered := strategyMap[crawler]
	if !registered {
		return nil
	}
	return strategy
}
25 changes: 25 additions & 0 deletions internal/crawlerdetect/google_strategy.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// Copyright 2023 ecodeclub
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package crawlerdetect

// GoogleStrategy is the crawler-detection strategy for Google.
// It embeds *UniversalStrategy, so the embedded type's methods are promoted
// onto GoogleStrategy.
type GoogleStrategy struct {
*UniversalStrategy
}

// NewGoogleStrategy builds a GoogleStrategy whose embedded UniversalStrategy
// is configured with the host names "googlebot.com", "google.com" and
// "googleusercontent.com".
func NewGoogleStrategy() *GoogleStrategy {
	hosts := []string{"googlebot.com", "google.com", "googleusercontent.com"}
	strategy := &GoogleStrategy{}
	strategy.UniversalStrategy = NewUniversalStrategy(hosts)
	return strategy
}
67 changes: 67 additions & 0 deletions internal/crawlerdetect/google_strategy_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// Copyright 2023 ecodeclub
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package crawlerdetect

import (
"errors"
"log"
"net"
"testing"

"github.com/stretchr/testify/require"
)

// TestGoogleStrategy runs CheckCrawler of the Google strategy against a table
// of literal IP addresses and checks both the match result and the returned
// error.
//
// NOTE(review): the cases use real addresses, so this test presumably needs
// working DNS resolution in the environment where it runs — confirm for CI.
func TestGoogleStrategy(t *testing.T) {
	s := NewGoogleStrategy()
	require.NotNil(t, s)
	testCases := []struct {
		name    string
		ip      string
		matched bool
		// errFunc validates the error returned by CheckCrawler.
		// A nil errFunc means the case expects no error at all.
		errFunc require.ErrorAssertionFunc
	}{
		{
			name:    "无效 ip",
			ip:      "256.0.0.0",
			matched: false,
			errFunc: func(t require.TestingT, err error, i ...interface{}) {
				var dnsError *net.DNSError
				if !errors.As(err, &dnsError) {
					// Fail only this test. log.Fatal would call os.Exit and
					// abort the entire test binary, hiding every other result.
					log.Printf("expected a *net.DNSError, got %T: %v", err, err)
					t.FailNow()
				}
			},
		},
		{
			name:    "非谷歌 ip",
			ip:      "166.249.90.77",
			matched: false,
		},
		{
			name:    "谷歌 ip",
			ip:      "66.249.90.77",
			matched: true,
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			m, err := s.CheckCrawler(tc.ip)
			// Always assert on the error: an expected error that never shows up
			// must fail the case, and an unexpected error must not reach a nil
			// errFunc (which would panic).
			if tc.errFunc != nil {
				tc.errFunc(t, err)
			} else {
				require.NoError(t, err)
			}
			require.Equal(t, tc.matched, m)
		})
	}
}
Loading

0 comments on commit 1226b60

Please sign in to comment.