Skip to content

Commit

Permalink
重构了BasePageProcessor.
Browse files Browse the repository at this point in the history
  • Loading branch information
zlzforever committed Sep 27, 2018
1 parent e17955d commit 2c5bbdd
Show file tree
Hide file tree
Showing 48 changed files with 452 additions and 786 deletions.
9 changes: 5 additions & 4 deletions src/DotnetSpider.Core.Test/TargetRequestExtractorTest.cs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
using System.Linq;
using System.Net.Http;
using DotnetSpider.Core.Processor.TargetRequestExtractors;
using Xunit;
using DotnetSpider.Downloader;
using DotnetSpider.Core.Processor.RequestExtractor;

namespace DotnetSpider.Core.Test
{
Expand All @@ -14,12 +14,13 @@ public void RegionAndPatternTargetUrlsExtractor()
HttpClient client = new HttpClient();
var html = client.GetStringAsync("http://www.cnblogs.com").Result;

var extracotr = new RegionAndPatternTargetRequestExtractor(".//div[@class='pager']", "/sitehome/p/\\d+", "^http://www\\.cnblogs\\.com/$");
var extracotr = new XPathRequestExtractor(".//div[@class='pager']");
//, "/sitehome/p/\\d+", "^http://www\\.cnblogs\\.com/$"
var page = new Page(new Request("http://cnblogs.com"));
page.Content = html;
page.ContentType = ContentType.Html;
var requets = Enumerable.ToList(extracotr.ExtractRequests(page));
Assert.Equal(11, requets.Count);
var requets = Enumerable.ToList(extracotr.Extract(page));
Assert.Equal(12, requets.Count);
Assert.Contains(requets, r => r.Url == "http://cnblogs.com/sitehome/p/2");
}
}
Expand Down
4 changes: 2 additions & 2 deletions src/DotnetSpider.Core/DotnetSpider.Core.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<TargetFrameworks>net451;netstandard2.0</TargetFrameworks>
<GeneratePackageOnBuild>true</GeneratePackageOnBuild>
<PackageRequireLicenseAcceptance>true</PackageRequireLicenseAcceptance>
<Version>3.0.4</Version>
<Version>3.0.5</Version>
<Authors>[email protected];</Authors>
<AssemblyName>DotnetSpider.Core</AssemblyName>
<Copyright>Copyright 2018 Lewis Zou</Copyright>
Expand Down Expand Up @@ -52,4 +52,4 @@
<PackageReference Include="Microsoft.Extensions.Logging" Version="2.1.1" />
<PackageReference Include="Serilog.Extensions.Logging" Version="2.0.2" />
</ItemGroup>
</Project>
</Project>
2 changes: 1 addition & 1 deletion src/DotnetSpider.Core/Env.cs
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ public static class Env
/// <summary>
/// 配置PageProcessor是否对深度为1的链接进行正则筛选
/// </summary>
public static bool ProcessorFilterDefaultRequest = true;
public static bool FilterDefaultRequest = true;

/// <summary>
/// 任务唯一标识的最大长度限制
Expand Down
18 changes: 13 additions & 5 deletions src/DotnetSpider.Core/Page.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,6 @@ public class Page : Response
/// </summary>
public bool Retry { get; set; }

/// <summary>
/// 对此页面跳过解析目标链接的操作
/// </summary>
public bool SkipExtractedTargetRequests { get; set; }

/// <summary>
/// 页面解析出来的目标链接不加入到调度队列中
/// </summary>
Expand Down Expand Up @@ -191,6 +186,19 @@ public void AddTargetRequest(Request request, bool increaseDeep = true)
}
}

/// <summary>
/// Builds a new dictionary from the properties of this page's request,
/// leaving out the entries keyed by <see cref="Env.UrlPropertyKey"/> and
/// <see cref="Env.TargetUrlPropertyKey"/>.
/// </summary>
/// <returns>A newly created dictionary holding the remaining key/value pairs.</returns>
public Dictionary<string, dynamic> CopyProperties()
{
    var copy = new Dictionary<string, dynamic>();

    foreach (var entry in Request.Properties)
    {
        // The URL and target-URL entries are not carried over.
        if (entry.Key == Env.UrlPropertyKey || entry.Key == Env.TargetUrlPropertyKey)
        {
            continue;
        }

        copy.Add(entry.Key, entry.Value);
    }

    return copy;
}

private bool IsAvailable(Request request)
{
if (request.Url == null)
Expand Down
79 changes: 45 additions & 34 deletions src/DotnetSpider.Core/Processor/BasePageProcessor.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
using Microsoft.Extensions.Logging;
using DotnetSpider.Downloader;
using Microsoft.Extensions.Logging;
using System;
using System.Collections.Generic;

namespace DotnetSpider.Core.Processor
{
Expand All @@ -13,9 +16,20 @@ public abstract class BasePageProcessor : IPageProcessor
public ILogger Logger { get; set; }

/// <summary>
/// 目标链接的解析器、抽取器
/// 用于判断是否需要处理当前 Request, 以及解析出来的目标链接是否需要添加到队列.
/// RequestExtractor 解析出来的结果也需验证是否符合 Filter, 如果不符合 Filter 那么最终也不会进入到 Processor, 即为无意义的 Request
/// </summary>
public ITargetRequestExtractor TargetUrlsExtractor { get; set; }
public IFilter Filter { get; set; }

/// <summary>
/// 解析目标链接的接口
/// </summary>
public IRequestExtractor RequestExtractor { get; set; }

/// <summary>
/// 是否最后一页的判断接口, 如果是最后一页, 则不需要执行 RequestExtractor
/// </summary>
public ILastPageChecker LastPageChecker { get; set; }

/// <summary>
/// 去掉链接#后面的所有内容
Expand Down Expand Up @@ -43,54 +57,51 @@ public void Process(Page page)
properties[Env.UrlPropertyKey] = page.Request.Url;
properties[Env.TargetUrlPropertyKey] = page.TargetUrl;

if (TargetUrlsExtractor != null)
if (!(page.Request.GetProperty(Page.Depth) == 1 && !Env.FilterDefaultRequest))
{
bool isTarget = true;
if ((page.Request.GetProperty(Page.Depth) != 1 || Env.ProcessorFilterDefaultRequest) && TargetUrlsExtractor.TargetUrlPatterns != null && TargetUrlsExtractor.TargetUrlPatterns.Count > 0 && !TargetUrlsExtractor.TargetUrlPatterns.Contains(null))
{
foreach (var regex in TargetUrlsExtractor.TargetUrlPatterns)
{
isTarget = regex.IsMatch(page.Request.Url);
if (isTarget)
{
break;
}
}
}

if (!isTarget)
if (Filter != null && !Filter.IsMatch(page.Request))
{
return;
}
}

Handle(page);

// IAfterDownloaderHandler中可以实现解析, 有可能不再需要解析了
if (!page.SkipExtractedTargetRequests && TargetUrlsExtractor != null)
{
ExtractUrls(page);
}
}
if (LastPageChecker != null && LastPageChecker.IsLastPage(page)) return;

/// <summary>
/// 解析目标链接并添加到Page对象中, 供Spider对象添加到对列中
/// </summary>
/// <param name="page">页面数据</param>
protected virtual void ExtractUrls(Page page)
{
var links = TargetUrlsExtractor.ExtractRequests(page);
if (links != null)
IEnumerable<Request> requests;
if (RequestExtractor != null && (requests = RequestExtractor.Extract(page)) != null)
{
foreach (var link in links)
foreach (var link in requests)
{
if (Filter != null && !Filter.IsMatch(link)) continue;

if (CleanPound)
{
link.Url = link.Url.Split('#')[0];
}

page.AddTargetRequest(link);
}
}
}

/// <summary>
/// Assigns <see cref="RequestExtractor"/> and returns this processor so that calls can be chained.
/// </summary>
/// <param name="requestExtractor">The extractor to store in <see cref="RequestExtractor"/>.</param>
/// <returns>This processor instance.</returns>
public BasePageProcessor SetRequestExtractor(IRequestExtractor requestExtractor)
{
RequestExtractor = requestExtractor;
return this;
}

/// <summary>
/// Assigns <see cref="Filter"/> and returns this processor so that calls can be chained.
/// </summary>
/// <param name="filter">The filter to store in <see cref="Filter"/>.</param>
/// <returns>This processor instance.</returns>
public BasePageProcessor SetFilter(IFilter filter)
{
Filter = filter;
return this;
}

/// <summary>
/// Assigns <see cref="LastPageChecker"/> and returns this processor so that calls can be chained.
/// </summary>
/// <param name="lastPageChecker">The checker to store in <see cref="LastPageChecker"/>.</param>
/// <returns>This processor instance.</returns>
public BasePageProcessor SetLastPageChecker(ILastPageChecker lastPageChecker)
{
LastPageChecker = lastPageChecker;
return this;
}
}
}
}
32 changes: 2 additions & 30 deletions src/DotnetSpider.Core/Processor/DefaultPageProcessor.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using DotnetSpider.Core.Processor.TargetRequestExtractors;
using System.Collections;
using System.Collections.Generic;

namespace DotnetSpider.Core.Processor
{
Expand All @@ -7,35 +8,6 @@ namespace DotnetSpider.Core.Processor
/// </summary>
public class DefaultPageProcessor : BasePageProcessor
{
/// <summary>
/// 构造方法
/// </summary>
/// <param name="partterns">匹配目标链接的正则表达式</param>
/// <param name="excludeParterns">排除目标链接的正则表达式</param>
public DefaultPageProcessor(string[] partterns = null, string[] excludeParterns = null)
{
var targetUrlsExtractor = new RegionAndPatternTargetRequestExtractor();
if (partterns != null && partterns.Length > 0)
{
targetUrlsExtractor.AddTargetUrlExtractor(".", partterns);
}
if (excludeParterns != null && excludeParterns.Length > 0)
{
targetUrlsExtractor.AddExcludeTargetUrlPatterns(excludeParterns);
}
TargetUrlsExtractor = targetUrlsExtractor;
}

/// <summary>
/// 添加目标链接解析规则
/// </summary>
/// <param name="regionXpath">目标链接所在区域</param>
/// <param name="patterns">匹配目标链接的正则表达式</param>
public void AddTargetUrlExtractor(string regionXpath, params string[] patterns)
{
(TargetUrlsExtractor as RegionAndPatternTargetRequestExtractor)?.AddTargetUrlExtractor(regionXpath, patterns);
}

/// <summary>
/// 解析页面数据
/// </summary>
Expand Down
58 changes: 58 additions & 0 deletions src/DotnetSpider.Core/Processor/Filter/PatternFilter.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
using System;
using System.Collections.Generic;
using System.Text.RegularExpressions;
using DotnetSpider.Downloader;

namespace DotnetSpider.Core.Processor.Filter
{
/// <summary>
/// An <see cref="IFilter"/> that decides on a request by testing its URL against
/// regular expressions: a list of include patterns and a list of exclude patterns.
/// Exclude patterns take precedence over include patterns.
/// </summary>
public class PatternFilter : IFilter
{
    private readonly List<string> _patterns;
    private readonly List<string> _excludePatterns;

    /// <summary>
    /// Creates a filter with include patterns only.
    /// </summary>
    /// <param name="patterns">Regular expressions a URL must match to be accepted.</param>
    public PatternFilter(params string[] patterns) : this(patterns, null) { }

    /// <summary>
    /// Creates a filter with include and exclude patterns.
    /// </summary>
    /// <param name="patterns">Regular expressions a URL must match to be accepted; null is treated as empty.</param>
    /// <param name="excludePatters">Regular expressions that reject a URL when matched; null is treated as empty.</param>
    public PatternFilter(IEnumerable<string> patterns, IEnumerable<string> excludePatters = null)
    {
        // Both sequences are copied, so later changes to the caller's collections have no effect here.
        _patterns = patterns == null ? new List<string>() : new List<string>(patterns);
        _excludePatterns = excludePatters == null ? new List<string>() : new List<string>(excludePatters);
    }

    /// <summary>
    /// Tests the URL of a request against the configured patterns.
    /// </summary>
    /// <param name="request">The request whose URL is tested.</param>
    /// <returns>
    /// true when no pattern is configured at all; false when any exclude pattern matches;
    /// otherwise true when there are no include patterns or at least one include pattern matches.
    /// </returns>
    public bool IsMatch(Request request)
    {
        // Nothing configured: accept without inspecting the request.
        if (_patterns.Count == 0 && _excludePatterns.Count == 0)
        {
            return true;
        }

        foreach (var pattern in _excludePatterns)
        {
            if (Regex.IsMatch(request.Url, pattern))
            {
                return false;
            }
        }

        // With exclude patterns only, everything that was not excluded is accepted.
        // Previously this case fell through to "return false", so an exclude-only
        // filter rejected every URL.
        if (_patterns.Count == 0)
        {
            return true;
        }

        foreach (var pattern in _patterns)
        {
            if (Regex.IsMatch(request.Url, pattern))
            {
                return true;
            }
        }

        return false;
    }

    /// <summary>
    /// Checks whether the given string is one of the include patterns.
    /// The comparison is on the pattern text itself, not a regular-expression match.
    /// </summary>
    /// <param name="pattern">The pattern text to look for.</param>
    /// <returns>true when the include pattern list contains <paramref name="pattern"/>.</returns>
    internal bool ContainsPattern(string pattern)
    {
        return _patterns.Contains(pattern);
    }
}
}
14 changes: 14 additions & 0 deletions src/DotnetSpider.Core/Processor/IFilter.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
using DotnetSpider.Downloader;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace DotnetSpider.Core.Processor
{
/// <summary>
/// Decides whether a <see cref="Request"/> is accepted.
/// </summary>
public interface IFilter
{
/// <summary>
/// Tests a request against this filter.
/// </summary>
/// <param name="request">The request to test.</param>
/// <returns>true when the request matches the filter; otherwise false.</returns>
bool IsMatch(Request request);
}
}
13 changes: 13 additions & 0 deletions src/DotnetSpider.Core/Processor/ILastPageChecker.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace DotnetSpider.Core.Processor
{
/// <summary>
/// Decides whether a page is the last page.
/// </summary>
public interface ILastPageChecker
{
/// <summary>
/// Tests whether the given page is the last page.
/// </summary>
/// <param name="page">The page to inspect.</param>
/// <returns>true when the page is the last page; otherwise false.</returns>
bool IsLastPage(Page page);
}
}
14 changes: 14 additions & 0 deletions src/DotnetSpider.Core/Processor/IRequestExtractor.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
using DotnetSpider.Downloader;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace DotnetSpider.Core.Processor
{
/// <summary>
/// Extracts requests from a page.
/// </summary>
public interface IRequestExtractor
{
/// <summary>
/// Extracts requests from the given page.
/// </summary>
/// <param name="page">The page to extract requests from.</param>
/// <returns>The extracted requests.</returns>
IEnumerable<Request> Extract(Page page);
}
}
Loading

0 comments on commit 2c5bbdd

Please sign in to comment.