Skip to content

Commit

Permalink
1. 实现不去重的QueueScheduler
Browse files Browse the repository at this point in the history
2. Release 2.4.5
  • Loading branch information
邹嵩 committed Mar 27, 2018
1 parent 33d1889 commit 70dd94d
Show file tree
Hide file tree
Showing 20 changed files with 300 additions and 167 deletions.
12 changes: 6 additions & 6 deletions nuget/DotnetSpider.Core.nuspec
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
<package xmlns="http://schemas.microsoft.com/packaging/2012/06/nuspec.xsd">
<metadata>
<id>DotnetSpider2.Core</id>
<version>2.4.4</version>
<version>2.4.5</version>
<authors>[email protected];Walterwhatwater;xiaohuan0204</authors>
<owners>[email protected]</owners>
<iconUrl>https://github.com/zlzforever/DotnetSpider/blob/master/images/icon.png?raw=true</iconUrl>
Expand All @@ -13,23 +13,23 @@
<description>A .NET Standard web crawling library similar to WebMagic and Scrapy. It is a lightweight, efficient and fast high-level web crawling &amp; scraping framework for .NET</description>
<dependencies>
<group targetFramework=".NETStandard2.0">
<dependency id="Newtonsoft.Json" version="10.0.3"/>
<dependency id="Newtonsoft.Json" version="11.0.2"/>
<dependency id="NLog" version="5.0.0-beta09"/>
<dependency id="HtmlAgilityPack" version="1.6.15"/>
<dependency id="HtmlAgilityPack" version="1.7.2"/>
<dependency id="System.Threading.Tasks.Parallel" version="4.3.0"/>
<dependency id="System.Text.Encoding.CodePages" version="4.4.0"/>
<dependency id="System.Runtime.InteropServices.RuntimeInformation" version="4.3.0"/>
<dependency id="System.Diagnostics.Process" version="4.3.0"/>
<dependency id="System.Configuration.ConfigurationManager" version="4.4.1"/>
<dependency id="System.Data.SqlClient" version="4.4.2"/>
<dependency id="System.Data.SqlClient" version="4.4.3"/>
<dependency id="Microsoft.Extensions.DependencyModel" version="2.0.4"/>
<dependency id="System.Runtime.Loader" version="4.3.0"/>
<dependency id="System.Net.Ping" version="4.3.0"/>
<dependency id="Polly" version="5.8.0" />
</group>
<group targetFramework=".NETFramework4.5">
<dependency id="Newtonsoft.Json" version="10.0.3"/>
<dependency id="HtmlAgilityPack" version="1.6.15"/>
<dependency id="Newtonsoft.Json" version="11.0.2"/>
<dependency id="HtmlAgilityPack" version="1.7.2"/>
<dependency id="NLog" version="4.4.12"/>
<dependency id="Polly" version="5.8.0" />
</group>
Expand Down
22 changes: 11 additions & 11 deletions nuget/DotnetSpider.Extension.nuspec
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
<package xmlns="http://schemas.microsoft.com/packaging/2012/06/nuspec.xsd">
<metadata>
<id>DotnetSpider2.Extension</id>
<version>2.4.3</version>
<version>2.4.5</version>
<authors>[email protected];Walterwhatwater;xiaohuan0204</authors>
<owners>[email protected]</owners>
<iconUrl>https://github.com/zlzforever/DotnetSpider/blob/master/images/icon.png?raw=true</iconUrl>
Expand All @@ -13,34 +13,34 @@
<description>A .NET Standard web crawling library similar to WebMagic and Scrapy. It is a lightweight, efficient and fast high-level web crawling &amp; scraping framework for .NET</description>
<dependencies>
<group targetFramework=".NETStandard2.0">
<dependency id="DotnetSpider2.Core" version="2.4.3" />
<dependency id="DotnetSpider2.Core" version="2.4.5" />
<dependency id="Dapper" version="1.50.2"/>
<dependency id="MailKit" version="2.0.1"/>
<dependency id="MailKit" version="2.0.2"/>
<dependency id="MongoDB.Driver" version="2.5.0"/>
<dependency id="MySql.Data" version="6.10.6"/>
<dependency id="StackExchange.Redis" version="1.2.6" />
<dependency id="SSH.NET" version="2016.1.0" />
<dependency id="System.Runtime.Extensions" version="4.3.0"/>
<dependency id="EPPlus.Core" version="1.5.4"/>
<dependency id="Selenium.WebDriver" version="3.8.0"/>
<dependency id="Npgsql" version="3.2.6"/>
<dependency id="CassandraCSharpDriver" version="3.4.0.1"/>
<dependency id="Selenium.WebDriver" version="3.11.0"/>
<dependency id="Npgsql" version="3.2.7"/>
<dependency id="CassandraCSharpDriver" version="3.4.1"/>
<dependency id="MessagePack" version="1.7.3.4"/>
</group>
<group targetFramework=".NETFramework4.5" >
<dependency id="DotnetSpider2.Core" version="2.4.3" />
<dependency id="DotnetSpider2.Core" version="2.4.5" />
<dependency id="Dapper" version="1.50.2"/>
<dependency id="MailKit" version="2.0.1"/>
<dependency id="MailKit" version="2.0.2"/>
<dependency id="MongoDB.Driver" version="2.5.0"/>
<dependency id="MySql.Data" version="6.9.11"/>
<dependency id="StackExchange.Redis" version="1.2.6" />
<dependency id="FiddlerCore2" version="1.0.0"/>
<dependency id="SSH.NET" version="2016.1.0" />
<dependency id="DotRas.for.Win7" version="1.3.0" />
<dependency id="EPPlus" version="4.1.1"/>
<dependency id="Selenium.WebDriver" version="3.8.0"/>
<dependency id="Npgsql" version="3.2.6"/>
<dependency id="CassandraCSharpDriver" version="3.4.0.1"/>
<dependency id="Selenium.WebDriver" version="3.11.0"/>
<dependency id="Npgsql" version="3.2.7"/>
<dependency id="CassandraCSharpDriver" version="3.4.1"/>
<dependency id="MessagePack" version="1.7.3.4"/>
</group>
</dependencies>
Expand Down
2 changes: 1 addition & 1 deletion src/DotnetSpider.Core.Test/Processor/ProcessorTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ public void ProcesserException()
new TestPageProcessor())
// save crawler result to file in the folder: \{running directory}\data\{crawler identity}\{guid}.dsd
.AddPipeline(new FilePipeline());

spider.ClearSchedulerAfterComplete = false;
// dowload html by http client
spider.Downloader = new HttpClientDownloader();

Expand Down
3 changes: 2 additions & 1 deletion src/DotnetSpider.Core.Test/SpiderTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ public void CloseSignal()
{
Spider spider = Spider.Create(new Site { CycleRetryTimes = 5, EncodingName = "UTF-8" },
new TestPageProcessor()).AddPipeline(new TestPipeline());

spider.ClearSchedulerAfterComplete = false;
for (int i = 0; i < 20; ++i)
{
spider.AddStartUrl($"http://www.baidu.com/_t={i}");
Expand All @@ -143,6 +143,7 @@ public void CloseSignal()

Spider spider2 = Spider.Create(new Site { CycleRetryTimes = 5, EncodingName = "UTF-8" },
new TestPageProcessor()).AddPipeline(new TestPipeline());
spider2.ClearSchedulerAfterComplete = false;
for (int i = 0; i < 25; ++i)
{
spider2.AddStartUrl($"http://www.baidu.com/_t={i}");
Expand Down
2 changes: 2 additions & 0 deletions src/DotnetSpider.Core/DotnetSpider.Core.projitems
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@
<Compile Include="$(MSBuildThisFileDirectory)Redial\RedialResult.cs" />
<Compile Include="$(MSBuildThisFileDirectory)Request.cs" />
<Compile Include="$(MSBuildThisFileDirectory)ResultItems.cs" />
<Compile Include="$(MSBuildThisFileDirectory)Scheduler\BaseScheduler.cs" />
<Compile Include="$(MSBuildThisFileDirectory)Scheduler\Component\BloomFilterDuplicateRemover.cs" />
<Compile Include="$(MSBuildThisFileDirectory)Scheduler\Component\HashSetDuplicateRemover.cs" />
<Compile Include="$(MSBuildThisFileDirectory)Scheduler\Component\IDuplicateRemover.cs" />
Expand All @@ -126,6 +127,7 @@
<Compile Include="$(MSBuildThisFileDirectory)Scheduler\IScheduler.cs" />
<Compile Include="$(MSBuildThisFileDirectory)Scheduler\PriorityScheduler.cs" />
<Compile Include="$(MSBuildThisFileDirectory)Scheduler\QueueDuplicateRemovedScheduler.cs" />
<Compile Include="$(MSBuildThisFileDirectory)Scheduler\QueueScheduler.cs" />
<Compile Include="$(MSBuildThisFileDirectory)Selector\HtmlSelector.cs" />
<Compile Include="$(MSBuildThisFileDirectory)Selector\AbstractSelectable.cs" />
<Compile Include="$(MSBuildThisFileDirectory)Selector\CssSelector.cs" />
Expand Down
120 changes: 120 additions & 0 deletions src/DotnetSpider.Core/Scheduler/BaseScheduler.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
using DotnetSpider.Core.Redial;
using System;
using System.Collections.Generic;

namespace DotnetSpider.Core.Scheduler
{
public abstract class BaseScheduler : Named, IScheduler, IDisposable
{
	/// <summary>
	/// The spider that owns this scheduler. Assigned exactly once via <see cref="Init"/>.
	/// </summary>
	protected ISpider Spider { get; set; }

	/// <summary>
	/// Increments the count of successfully crawled requests by one.
	/// </summary>
	public abstract void IncreaseSuccessCount();

	/// <summary>
	/// Increments the count of failed crawl attempts by one.
	/// </summary>
	public abstract void IncreaseErrorCount();

	/// <summary>
	/// Imports a batch of requests into the scheduler.
	/// </summary>
	/// <param name="requests">The requests to enqueue.</param>
	public abstract void Import(IEnumerable<Request> requests);

	/// <summary>
	/// Whether pushing a request needs network access (and therefore must be
	/// routed through <see cref="NetworkCenter"/> so redial handling applies).
	/// </summary>
	protected abstract bool UseInternet { get; set; }

	/// <summary>
	/// Number of requests still waiting to be crawled.
	/// </summary>
	public abstract long LeftRequestsCount { get; }

	/// <summary>
	/// Total number of requests ever pushed.
	/// NOTE(review): virtual get-only with no backing assignment in this class,
	/// so it reads as 0 unless a derived scheduler overrides it — confirm intended.
	/// </summary>
	public virtual long TotalRequestsCount { get; }

	/// <summary>
	/// Number of requests crawled successfully.
	/// </summary>
	public abstract long SuccessRequestsCount { get; }

	/// <summary>
	/// Number of failed crawl attempts. This counts attempts, not distinct links:
	/// a link that fails several times is counted each time.
	/// </summary>
	public abstract long ErrorRequestsCount { get; }

	/// <summary>
	/// Whether traversal is depth-first. Defaults to true.
	/// </summary>
	public bool DepthFirst { get; set; } = true;

	/// <summary>
	/// Pushes a request onto the queue, routing through the network center
	/// when the scheduler needs internet access.
	/// </summary>
	/// <param name="request">The request to enqueue.</param>
	public void Push(Request request)
	{
		if (!UseInternet)
		{
			DoPush(request);
			return;
		}
		// Wrap the push so redial logic can coordinate network-dependent work.
		NetworkCenter.Current.Execute("sch-push", () => DoPush(request));
	}

	/// <summary>
	/// Binds this scheduler to a spider. May only be called once.
	/// </summary>
	/// <param name="spider">The owning spider.</param>
	/// <exception cref="SpiderException">Thrown when the scheduler was already initialized.</exception>
	public virtual void Init(ISpider spider)
	{
		if (Spider != null)
		{
			throw new SpiderException("Scheduler already init");
		}
		Spider = spider;
	}

	/// <summary>
	/// Takes the next request to be processed.
	/// </summary>
	/// <returns>The next request, as determined by the concrete scheduler.</returns>
	public abstract Request Poll();

	/// <summary>
	/// Performs application-defined tasks associated with freeing, releasing, or resetting unmanaged resources.
	/// </summary>
	public abstract void Dispose();

	/// <summary>
	/// Exports the whole queue. No-op by default; concrete schedulers may override.
	/// </summary>
	public virtual void Export()
	{
	}

	/// <summary>
	/// Decides whether a request should be kept for retry: it has been tried at
	/// least once and has not yet exhausted the site's cycle-retry budget.
	/// </summary>
	protected virtual bool ShouldReserved(Request request)
	{
		var tried = request.CycleTriedTimes;
		return tried > 0 && tried <= Spider.Site.CycleRetryTimes;
	}

	/// <summary>
	/// Concrete enqueue implementation supplied by derived schedulers.
	/// </summary>
	protected abstract void DoPush(Request request);
}
}
Loading

0 comments on commit 70dd94d

Please sign in to comment.