diff --git a/README.md b/README.md index 6a7712474..61633fec2 100644 --- a/README.md +++ b/README.md @@ -21,11 +21,10 @@ DotnetSpider, a .NET Standard web crawling library similar to WebMagic and Scrap ### OPTIONAL ENVIROMENT -- Run distributed crawler. [Download Redis for windows](https://github.com/MSOpenTech/redis/releases) +- Distributed crawler. [Download Redis for windows](https://github.com/MSOpenTech/redis/releases) - SqlServer. - PostgreSQL. - MongoDb -- Cassandra ### MORE DOCUMENTS @@ -53,7 +52,7 @@ https://github.com/dotnetcore/DotnetSpider/wiki private class Spider : EntitySpider { - protected override void MyInit(params string[] arguments) + protected override void OnInit(params string[] arguments) { var word = "可乐|雪碧"; AddStartUrl(string.Format("http://news.baidu.com/ns?word={0}&tn=news&from=news&cl=2&pn=0&rn=20&ct=1", word), new Dictionary { { "Keyword", word } }); @@ -130,15 +129,12 @@ NOTE: ### Storage log and status to database -1. Set SystemConnection in app.config -2. Update nlog.config like https://github.com/dotnetcore/DotnetSpider/blob/master/src/DotnetSpider.Extension.Test/nlog.config - -### Web Manager +### DotnetSpider.Hub https://github.com/zlzforever/DotnetSpider.Hub -1. Dependences a ci platform forexample i used gitlab-ci right now. +1. Dependences a ci platform forexample i used teamcity right now. 2. Dependences Sceduler.NET https://github.com/zlzforever/Scheduler.NET 3. More documents continue... diff --git a/nuget/DotnetSpider.Core.nuspec b/nuget/DotnetSpider.Core.nuspec deleted file mode 100644 index 79139a752..000000000 --- a/nuget/DotnetSpider.Core.nuspec +++ /dev/null @@ -1,55 +0,0 @@ - - - - DotnetSpider.Core - 2.6.0-beta7 - zlzforever@163.com;Walterwhatwater; - zlzforever@163.com - https://github.com/zlzforever/DotnetSpider/blob/master/images/icon.png?raw=true - https://raw.githubusercontent.com/zlzforever/DotnetSpider/master/LICENSE - https://github.com/zlzforever/DotnetSpider - true - DotnetSpider;crawler;cross platform;dotnet core - A .NET Standard web crawling library similar to WebMagic and Scrapy. It is a lightweight ,efficient and fast high-level web crawling & scraping framework for .NET - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/nuget/DotnetSpider.Extension.nuspec b/nuget/DotnetSpider.Extension.nuspec deleted file mode 100644 index 9ec97254e..000000000 --- a/nuget/DotnetSpider.Extension.nuspec +++ /dev/null @@ -1,56 +0,0 @@ - - - - DotnetSpider.Extension - 2.6.0-beta5 - zlzforever@163.com;Walterwhatwater; - zlzforever@163.com - https://github.com/zlzforever/DotnetSpider/blob/master/images/icon.png?raw=true - https://raw.githubusercontent.com/zlzforever/DotnetSpider/master/LICENSE - https://github.com/zlzforever/DotnetSpider - true - DotnetSpider;crawler;cross platform;dotnet core - A .NET Standard web crawling library similar to WebMagic and Scrapy. It is a lightweight ,efficient and fast high-level web crawling & scraping framework for .NET - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/src/DotnetSpider.Common.Test/RequestTest.cs b/src/DotnetSpider.Common.Test/RequestTest.cs index 0ba99e45a..33ca73c71 100644 --- a/src/DotnetSpider.Common.Test/RequestTest.cs +++ b/src/DotnetSpider.Common.Test/RequestTest.cs @@ -1,5 +1,6 @@ using Newtonsoft.Json; using System.Collections.Generic; +using System.Net.Http; using Xunit; namespace DotnetSpider.Common.Test @@ -24,6 +25,7 @@ public void Request() Assert.Single(request.Properties); Assert.Equal(request.Properties["Test"], "Forever"); } + [Fact(DisplayName = "Request_PutExtra")] public void PutExtra() { diff --git a/src/DotnetSpider.Common/DotnetSpider.Common.csproj b/src/DotnetSpider.Common/DotnetSpider.Common.csproj index 6ffe79631..9b260ba56 100644 --- a/src/DotnetSpider.Common/DotnetSpider.Common.csproj +++ b/src/DotnetSpider.Common/DotnetSpider.Common.csproj @@ -3,7 +3,7 @@ net40;net45;netstandard2.0 true true - 3.0.0-beta1 + 3.0.0 zlzforever@163.com; DotnetSpider.Common Copyright 2018 Lewis Zou @@ -18,9 +18,11 @@ + + diff --git a/src/DotnetSpider.Common/HttpMethod.cs b/src/DotnetSpider.Common/HttpMethod.cs deleted file mode 100644 index deeec9e44..000000000 --- a/src/DotnetSpider.Common/HttpMethod.cs +++ /dev/null @@ -1,18 +0,0 @@ -using Newtonsoft.Json; -using Newtonsoft.Json.Converters; - -namespace DotnetSpider.Common -{ - [JsonConverter(typeof(StringEnumConverter))] - public enum HttpMethod - { - Get, - Post, - Put, - Delete, - Head, - Options, - Patch, - Trace - } -} \ No newline at end of file diff --git a/src/DotnetSpider.Common/Request.cs b/src/DotnetSpider.Common/Request.cs index 574597cb5..19edfcbb9 100644 --- a/src/DotnetSpider.Common/Request.cs +++ b/src/DotnetSpider.Common/Request.cs @@ -1,6 +1,8 @@ using System; using System.Collections.Generic; +using System.Net.Http; using Newtonsoft.Json; +using Newtonsoft.Json.Converters; namespace DotnetSpider.Common { @@ -32,7 +34,7 @@ public class Request : IDisposable /// /// 请求链接的方法 /// - public HttpMethod Method { get; set; } + public HttpMethod Method { get; set; } = HttpMethod.Get; /// /// 链接的优先级, 仅用于优先级队列 diff --git a/src/DotnetSpider.Common/TaskName.cs b/src/DotnetSpider.Common/TaskName.cs index b72cc9a4e..b1b5b1e62 100644 --- a/src/DotnetSpider.Common/TaskName.cs +++ b/src/DotnetSpider.Common/TaskName.cs @@ -5,7 +5,7 @@ namespace DotnetSpider.Common /// /// 任务名称 /// - [AttributeUsage(AttributeTargets.Class, AllowMultiple = false, Inherited = false)] + [AttributeUsage(AttributeTargets.Class, Inherited = false)] public class TaskName : Attribute { /// @@ -14,7 +14,6 @@ public class TaskName : Attribute public string Name { get; - private set; } /// diff --git a/src/DotnetSpider.Core.Test/PageTest.cs b/src/DotnetSpider.Core.Test/PageTest.cs index ed38b26a0..e6b844a29 100644 --- a/src/DotnetSpider.Core.Test/PageTest.cs +++ b/src/DotnetSpider.Core.Test/PageTest.cs @@ -1,6 +1,7 @@ using DotnetSpider.Common; using System.Collections.Generic; using System.Linq; +using System.Net.Http; using Xunit; namespace DotnetSpider.Core.Test diff --git a/src/DotnetSpider.Core.Test/SpiderTest.cs b/src/DotnetSpider.Core.Test/SpiderTest.cs index 9d2ddb5ac..6e6febc87 100644 --- a/src/DotnetSpider.Core.Test/SpiderTest.cs +++ b/src/DotnetSpider.Core.Test/SpiderTest.cs @@ -153,7 +153,7 @@ public void WhenNoStartUrl() internal class TestPipeline : BasePipeline { - public override void Process(IEnumerable resultItems, ILogger logger, dynamic sender = null) + public override void Process(IList resultItems, ILogger logger, dynamic sender = null) { foreach (var resultItem in resultItems) { @@ -276,7 +276,7 @@ protected override Response DowloadContent(Request request) internal class FastExitPipeline : BasePipeline { - public override void Process(IEnumerable resultItems, ILogger logger, dynamic sender = null) + public override void Process(IList resultItems, ILogger logger, dynamic sender = null) { File.AppendAllLines("FastExit_Result.txt", new[] { resultItems.First().Request.Url.ToString() }); } diff --git a/src/DotnetSpider.Core/DotnetSpider.Core.csproj b/src/DotnetSpider.Core/DotnetSpider.Core.csproj index 1fbe2372e..db05b2f50 100644 --- a/src/DotnetSpider.Core/DotnetSpider.Core.csproj +++ b/src/DotnetSpider.Core/DotnetSpider.Core.csproj @@ -1,10 +1,9 @@  - net40;net45;netstandard2.0 true true - 3.0.0-beta1 + 3.0.0 zlzforever@163.com; DotnetSpider.Core Copyright 2018 Lewis Zou diff --git a/src/DotnetSpider.Core/Env.cs b/src/DotnetSpider.Core/Env.cs index 39ff01e9b..941c5375c 100644 --- a/src/DotnetSpider.Core/Env.cs +++ b/src/DotnetSpider.Core/Env.cs @@ -16,11 +16,6 @@ namespace DotnetSpider.Core /// public static class Env { - /// - /// TODO: 原则上此版本号与Nuget包版本号同步, 但是不知道有什么好的自动化更新方法 - /// - public const string Version = "2.0.21"; - /// /// 从配置文件中读取默认Redis连接字符串的关键字 /// diff --git a/src/DotnetSpider.Core/ISpider.cs b/src/DotnetSpider.Core/ISpider.cs index 84bb103d3..70a5efe5f 100644 --- a/src/DotnetSpider.Core/ISpider.cs +++ b/src/DotnetSpider.Core/ISpider.cs @@ -9,7 +9,7 @@ namespace DotnetSpider.Core /// /// 爬虫接口定义 /// - public interface ISpider : IDisposable, IControllable, IAppBase + public interface ISpider : IDisposable, IAppBase { /// /// 采集站点的信息配置 diff --git a/src/DotnetSpider.Core/Infrastructure/Description.cs b/src/DotnetSpider.Core/Infrastructure/Description.cs index 28f613229..d54fa43bc 100644 --- a/src/DotnetSpider.Core/Infrastructure/Description.cs +++ b/src/DotnetSpider.Core/Infrastructure/Description.cs @@ -11,26 +11,26 @@ public class Description : Attribute /// /// 任务所有者 /// - public string Owner; + public string Owner { get; set; } /// /// 程序的开发者 /// - public string Developer; + public string Developer{ get; set; } /// /// 程序的开发时间 /// - public string Date; + public string Date{ get; set; } /// /// 任务主题 /// - public string Subject; + public string Subject{ get; set; } /// /// 联系邮箱 /// - public string Email; + public string Email{ get; set; } } } \ No newline at end of file diff --git a/src/DotnetSpider.Core/Infrastructure/EncodingExtensions.cs b/src/DotnetSpider.Core/Infrastructure/EncodingExtensions.cs deleted file mode 100644 index 7a2129e0f..000000000 --- a/src/DotnetSpider.Core/Infrastructure/EncodingExtensions.cs +++ /dev/null @@ -1,126 +0,0 @@ -using System; -using System.Text; -using System.Text.RegularExpressions; - -namespace DotnetSpider.Core.Infrastructure -{ - /// - /// 编码扩展 - /// - public static class EncodingExtensions - { - private const int Utf8PreambleLength = 3; - private const byte Utf8PreambleByte2 = 0xBF; - private const int Utf8PreambleFirst2Bytes = 0xEFBB; - - // UTF32 not supported on Phone - private const int Utf32PreambleLength = 4; - private const byte Utf32PreambleByte2 = 0x00; - private const byte Utf32PreambleByte3 = 0x00; - private const int Utf32OrUnicodePreambleFirst2Bytes = 0xFFFE; - private const int BigEndianUnicodePreambleFirst2Bytes = 0xFEFF; - - /// - /// 检测编码类型 - /// - /// 编码名称 - /// 被检测的编码 - /// 编码类型 - public static Encoding GetEncoding(string characterSet, byte[] bytes) - { - if (!string.IsNullOrWhiteSpace(characterSet)) - { - return GetEncoding(characterSet); - } - else - { - Match meta = Regex.Match(Encoding.UTF8.GetString(bytes), " 0) - { - c = meta.Groups[1].Value.ToLower().Trim(); - } - if (c.Length > 2) - { - try - { - return Encoding.GetEncoding(c.Replace("\"", string.Empty).Replace("'", "").Replace(";", "").Replace("iso-8859-1", "gbk").Trim()); - } - catch - { - var buffer = new ArraySegment(bytes); - return DetectEncoding(buffer); - } - } - else - { - var buffer = new ArraySegment(bytes); - return DetectEncoding(buffer); - } - } - } - - private static Encoding GetEncoding(string contentType) - { - var encodingName = contentType.ToLower(); - if (encodingName.Contains("gb2312")) - { - return Encoding.GetEncoding("GB2312"); - } - else if (encodingName.Contains("gbk")) - { - return Encoding.GetEncoding("GBK"); - } - else if (encodingName.Contains("utf-8") || encodingName.Contains("utf8")) - { - return Encoding.UTF8; - } - else - { - return Encoding.UTF8; - } - } - - private static Encoding DetectEncoding(ArraySegment buffer) - { - byte[] data = buffer.Array; - int offset = buffer.Offset; - int dataLength = buffer.Count; - - - if (dataLength >= 2 && data != null) - { - int first2Bytes = data[offset + 0] << 8 | data[offset + 1]; - - switch (first2Bytes) - { - case Utf8PreambleFirst2Bytes: - if (dataLength >= Utf8PreambleLength && data[offset + 2] == Utf8PreambleByte2) - { - return Encoding.UTF8; - } - break; - - case Utf32OrUnicodePreambleFirst2Bytes: -#if !NETNative - // UTF32 not supported on Phone - if (dataLength >= Utf32PreambleLength && data[offset + 2] == Utf32PreambleByte2 && data[offset + 3] == Utf32PreambleByte3) - { - return Encoding.UTF32; - } - else -#endif - { - return Encoding.Unicode; - } - - case BigEndianUnicodePreambleFirst2Bytes: - return Encoding.BigEndianUnicode; - - } - } - - return Encoding.UTF8; - } - } -} diff --git a/src/DotnetSpider.Core/Monitor/LogMonitor.cs b/src/DotnetSpider.Core/Monitor/LogMonitor.cs index 9a327b994..c3f5f892b 100644 --- a/src/DotnetSpider.Core/Monitor/LogMonitor.cs +++ b/src/DotnetSpider.Core/Monitor/LogMonitor.cs @@ -9,7 +9,6 @@ public class LogMonitor : IMonitor { public ILogger Logger{ get; set; } - /// /// 上报爬虫状态 /// diff --git a/src/DotnetSpider.Core/Pipeline/BasePipeline.cs b/src/DotnetSpider.Core/Pipeline/BasePipeline.cs index e4f7b07ec..f40d6b2fc 100644 --- a/src/DotnetSpider.Core/Pipeline/BasePipeline.cs +++ b/src/DotnetSpider.Core/Pipeline/BasePipeline.cs @@ -14,7 +14,7 @@ public abstract class BasePipeline : IPipeline /// 数据结果 /// 日志接口 /// 调用方 - public abstract void Process(IEnumerable resultItems, ILogger logger, dynamic sender = null); + public abstract void Process(IList resultItems, ILogger logger, dynamic sender = null); /// /// Performs application-defined tasks associated with freeing, releasing, or resetting unmanaged resources. @@ -22,5 +22,21 @@ public abstract class BasePipeline : IPipeline public virtual void Dispose() { } + + protected string GetIdentity(dynamic sender) + { + if (sender == null) + { + throw new SpiderException("Sender should not be null."); + } + try + { + return sender.Identity; + } + catch + { + throw new SpiderException("Sender should be a IIdentity object."); + } + } } -} +} \ No newline at end of file diff --git a/src/DotnetSpider.Core/Pipeline/CollectionPipeline.cs b/src/DotnetSpider.Core/Pipeline/CollectionPipeline.cs index 16262a6b2..42fc6c2d2 100644 --- a/src/DotnetSpider.Core/Pipeline/CollectionPipeline.cs +++ b/src/DotnetSpider.Core/Pipeline/CollectionPipeline.cs @@ -16,7 +16,7 @@ public class CollectionPipeline : BasePipeline, ICollectionPipeline /// /// 数据拥有者 /// 数据结果 - public IEnumerable GetCollection(dynamic owner) + public IList GetCollection(dynamic owner) { lock (ItemsLocker) { @@ -36,16 +36,18 @@ public IEnumerable GetCollection(dynamic owner) /// /// 数据结果 /// 日志接口 - public override void Process(IEnumerable resultItems, ILogger logger, dynamic sender) + /// 调用方 + public override void Process(IList resultItems, ILogger logger, dynamic sender = null) { + var identity = GetIdentity(sender); lock (ItemsLocker) { - if (!_items.ContainsKey(sender)) + if (!_items.ContainsKey(identity)) { - _items.Add(sender, new List()); + _items.Add(identity, new List()); } - _items[sender].AddRange(resultItems); + _items[identity].AddRange(resultItems); } } } diff --git a/src/DotnetSpider.Core/Pipeline/ConsolePipeline.cs b/src/DotnetSpider.Core/Pipeline/ConsolePipeline.cs index fe00c8b68..4166c15ad 100644 --- a/src/DotnetSpider.Core/Pipeline/ConsolePipeline.cs +++ b/src/DotnetSpider.Core/Pipeline/ConsolePipeline.cs @@ -14,7 +14,7 @@ public class ConsolePipeline : BasePipeline /// 数据结果 /// 日志接口 /// 调用方 - public override void Process(IEnumerable resultItems, ILogger logger, dynamic sender = null) + public override void Process(IList resultItems, ILogger logger, dynamic sender = null) { foreach (var resultItem in resultItems) { diff --git a/src/DotnetSpider.Core/Pipeline/FilePipeline.cs b/src/DotnetSpider.Core/Pipeline/FilePipeline.cs index 41e8a50d7..439b49656 100644 --- a/src/DotnetSpider.Core/Pipeline/FilePipeline.cs +++ b/src/DotnetSpider.Core/Pipeline/FilePipeline.cs @@ -32,7 +32,7 @@ public FilePipeline(string interval) : base(interval) /// 数据结果 /// 日志接口 /// 调用方 - public override void Process(IEnumerable resultItems, ILogger logger, dynamic sender = null) + public override void Process(IList resultItems, ILogger logger, dynamic sender = null) { try { diff --git a/src/DotnetSpider.Core/Pipeline/ICollectionPipeline.cs b/src/DotnetSpider.Core/Pipeline/ICollectionPipeline.cs index 274508550..14693e10b 100644 --- a/src/DotnetSpider.Core/Pipeline/ICollectionPipeline.cs +++ b/src/DotnetSpider.Core/Pipeline/ICollectionPipeline.cs @@ -12,6 +12,6 @@ public interface ICollectionPipeline : IPipeline /// /// 数据拥有者 /// All results collected - IEnumerable GetCollection(dynamic owner); + IList GetCollection(dynamic owner); } } diff --git a/src/DotnetSpider.Core/Pipeline/IPipeline.cs b/src/DotnetSpider.Core/Pipeline/IPipeline.cs index 6353175c7..7b4f623b0 100644 --- a/src/DotnetSpider.Core/Pipeline/IPipeline.cs +++ b/src/DotnetSpider.Core/Pipeline/IPipeline.cs @@ -15,6 +15,6 @@ public interface IPipeline : IDisposable /// 数据结果 /// 日志接口 /// 调用方 - void Process(IEnumerable resultItems, ILogger logger, dynamic sender = null); + void Process(IList resultItems, ILogger logger, dynamic sender = null); } } \ No newline at end of file diff --git a/src/DotnetSpider.Core/Pipeline/JsonFilePipeline.cs b/src/DotnetSpider.Core/Pipeline/JsonFilePipeline.cs index f5357d815..eca657505 100644 --- a/src/DotnetSpider.Core/Pipeline/JsonFilePipeline.cs +++ b/src/DotnetSpider.Core/Pipeline/JsonFilePipeline.cs @@ -36,9 +36,9 @@ public JsonFilePipeline(string interval) : base(interval) /// 数据结果 /// 日志接口 /// 调用方 - public override void Process(IEnumerable resultItems, ILogger logger, dynamic sender = null) + public override void Process(IList resultItems, ILogger logger, dynamic sender = null) { - var jsonFile = Path.Combine(GetDataFolder(sender), $"{sender.Identity}.json"); + var jsonFile = Path.Combine(GetDataFolder(sender), $"{GetIdentity(sender)}.json"); try { var streamWriter = GetStreamWriter(jsonFile); @@ -53,7 +53,7 @@ public override void Process(IEnumerable resultItems, ILogger logge catch (Exception e) { logger.Error($"Storage data to file {jsonFile} failed: {e}."); - throw e; + throw; } } diff --git a/src/DotnetSpider.Core/Pipeline/SilentPipeline.cs b/src/DotnetSpider.Core/Pipeline/SilentPipeline.cs index 24db439da..cb2c8e92c 100644 --- a/src/DotnetSpider.Core/Pipeline/SilentPipeline.cs +++ b/src/DotnetSpider.Core/Pipeline/SilentPipeline.cs @@ -9,7 +9,7 @@ public void Dispose() { } - public void Process(IEnumerable resultItems, ILogger logger, dynamic sender = null) + public void Process(IList resultItems, ILogger logger, dynamic sender = null) { } } diff --git a/src/DotnetSpider.Core/Processor/BasePageProcessor.cs b/src/DotnetSpider.Core/Processor/BasePageProcessor.cs index 237c6343b..b13711e02 100644 --- a/src/DotnetSpider.Core/Processor/BasePageProcessor.cs +++ b/src/DotnetSpider.Core/Processor/BasePageProcessor.cs @@ -59,7 +59,6 @@ public void Process(Page page, ILogger logger) /// 解析目标链接并添加到Page对象中, 供Spider对象添加到对列中 /// /// 页面数据 - /// 爬虫对象 protected virtual void ExtractUrls(Page page) { var links = TargetUrlsExtractor.ExtractRequests(page); diff --git a/src/DotnetSpider.Core/Processor/SimplePageProcessor.cs b/src/DotnetSpider.Core/Processor/SimplePageProcessor.cs index 1936dac25..d8c3f74dd 100644 --- a/src/DotnetSpider.Core/Processor/SimplePageProcessor.cs +++ b/src/DotnetSpider.Core/Processor/SimplePageProcessor.cs @@ -11,7 +11,6 @@ public class SimplePageProcessor : BasePageProcessor /// 页面数据 protected override void Handle(Page page) { - page.AddResultItem("title", page.Selectable().XPath("//title")); page.AddResultItem("html", page.Content); } } diff --git a/src/DotnetSpider.Core/Processor/TargetRequestExtractors/AutoIncrementTargetRequestExtractor.cs b/src/DotnetSpider.Core/Processor/TargetRequestExtractors/AutoIncrementTargetRequestExtractor.cs index ba7d188de..65367037b 100644 --- a/src/DotnetSpider.Core/Processor/TargetRequestExtractors/AutoIncrementTargetRequestExtractor.cs +++ b/src/DotnetSpider.Core/Processor/TargetRequestExtractors/AutoIncrementTargetRequestExtractor.cs @@ -6,7 +6,7 @@ namespace DotnetSpider.Core.Processor.TargetRequestExtractors { /// - /// 通过自增计算出新的目标链接, 比如: www.a.com/1.html->www.a.com/2.html + /// 通过自增计算出新的目标链接, 比如: www.a.com/1.html->www.a.com/2.html /// public class AutoIncrementTargetRequestExtractor : TargetRequestExtractor { @@ -36,8 +36,7 @@ public AutoIncrementTargetRequestExtractor(string paginationStr, int interval = /// /// 解析出目标链接 /// - /// 页面数据 - /// 站点信息 + /// 页面数据 /// 目标链接 protected override IEnumerable Extract(Response response) { diff --git a/src/DotnetSpider.Core/Processor/TargetRequestExtractors/RegionAndPatternTargetRequestExtractor.cs b/src/DotnetSpider.Core/Processor/TargetRequestExtractors/RegionAndPatternTargetRequestExtractor.cs index c71b9853c..5f9cd354b 100644 --- a/src/DotnetSpider.Core/Processor/TargetRequestExtractors/RegionAndPatternTargetRequestExtractor.cs +++ b/src/DotnetSpider.Core/Processor/TargetRequestExtractors/RegionAndPatternTargetRequestExtractor.cs @@ -15,8 +15,6 @@ public sealed class RegionAndPatternTargetRequestExtractor : TargetRequestExtrac private readonly Dictionary> _regionSelectorMapPatterns = new Dictionary>(); - private static readonly ISelector ImageSelector = Selectors.XPath(".//img/@src"); - /// /// 构造方法 /// @@ -38,7 +36,6 @@ public RegionAndPatternTargetRequestExtractor(string regionXpath, params string[ /// 解析出目标链接 /// /// 链接请求结果 - /// 站点信息 /// 目标链接 protected override IEnumerable Extract(Response response) { @@ -56,7 +53,7 @@ protected override IEnumerable Extract(Response response) continue; } - List requests = null; + List requests; if (response.ContentType == ContentType.Json) { @@ -67,7 +64,7 @@ protected override IEnumerable Extract(Response response) requests = new List(response.Selectable().SelectList(targetUrlExtractor.Key).Links().GetValues()); } - if (requests == null || requests.Count == 0) + if (requests.Count == 0) { continue; } diff --git a/src/DotnetSpider.Core/ResponseExtensions.cs b/src/DotnetSpider.Core/ResponseExtensions.cs index 16c08731d..828a96afe 100644 --- a/src/DotnetSpider.Core/ResponseExtensions.cs +++ b/src/DotnetSpider.Core/ResponseExtensions.cs @@ -8,12 +8,11 @@ public static class ResponseExtensions public static Selectable Selectable(this Response response) { var site = response.Request.Site; - var selectable = (response.Delivery != null && response.Delivery is Selectable) ? (Selectable)response.Delivery : + response.Delivery = response.Delivery != null && response.Delivery is Selectable ? response.Delivery : response.ContentType == ContentType.Json ? new Selectable(response.Content, site.Padding) : new Selectable(response.Content, response.Request.Url, response.Request.Site.Domains); - response.Delivery = selectable; - selectable.Properties = response.Request.Properties; - return selectable; + response.Delivery.Properties = response.Request.Properties; + return response.Delivery; } public static Page ToPage(this Response response) diff --git a/src/DotnetSpider.Core/ResultItems.cs b/src/DotnetSpider.Core/ResultItems.cs index bb0108724..2a520d21d 100644 --- a/src/DotnetSpider.Core/ResultItems.cs +++ b/src/DotnetSpider.Core/ResultItems.cs @@ -15,11 +15,6 @@ public class ResultItems /// public readonly Dictionary Results = new Dictionary(); - /// - /// 数据传递 - /// - public dynamic Object { get; set; } - /// /// 对应的目标链接信息 /// diff --git a/src/DotnetSpider.Core/Scheduler/DuplicateRemovedScheduler.cs b/src/DotnetSpider.Core/Scheduler/DuplicateRemovedScheduler.cs index 37f49a3f6..bdc13ab59 100644 --- a/src/DotnetSpider.Core/Scheduler/DuplicateRemovedScheduler.cs +++ b/src/DotnetSpider.Core/Scheduler/DuplicateRemovedScheduler.cs @@ -44,13 +44,12 @@ public abstract class DuplicateRemovedScheduler : Named, IScheduler /// 批量导入 /// /// 请求对象 - public abstract void Reload(IEnumerable requests); + public abstract void Reload(ICollection requests); /// /// 如果链接不是重复的就添加到队列中 /// /// 请求对象 - /// 请求对象的分组 protected abstract void PushWhenNoDuplicate(Request request); /// @@ -96,11 +95,12 @@ public int Depth /// 添加请求对象到队列 /// /// 请求对象 + /// 是否需要重试判断方法 public void Push(Request request, Func shouldReserved = null) { var action = new Action(() => { - if (!DuplicateRemover.IsDuplicate(request) || (shouldReserved != null && shouldReserved(request))) + if (!DuplicateRemover.IsDuplicate(request) || shouldReserved != null && shouldReserved(request)) { PushWhenNoDuplicate(request); } diff --git a/src/DotnetSpider.Core/Scheduler/IScheduler.cs b/src/DotnetSpider.Core/Scheduler/IScheduler.cs index d1a71981c..ca33fe866 100644 --- a/src/DotnetSpider.Core/Scheduler/IScheduler.cs +++ b/src/DotnetSpider.Core/Scheduler/IScheduler.cs @@ -39,7 +39,7 @@ public interface IScheduler : IDisposable, IMonitorable /// 批量导入 /// /// 请求对象 - void Reload(IEnumerable requests); + void Reload(ICollection requests); /// /// 导出整个队列 diff --git a/src/DotnetSpider.Core/Scheduler/QueueDuplicateRemovedScheduler.cs b/src/DotnetSpider.Core/Scheduler/QueueDuplicateRemovedScheduler.cs index 84543b4d2..f56a8ee1f 100644 --- a/src/DotnetSpider.Core/Scheduler/QueueDuplicateRemovedScheduler.cs +++ b/src/DotnetSpider.Core/Scheduler/QueueDuplicateRemovedScheduler.cs @@ -133,7 +133,7 @@ public override void IncreaseErrorCount() /// 批量导入 /// /// 请求对象 - public override void Reload(IEnumerable requests) + public override void Reload(ICollection requests) { if (requests == null) { diff --git a/src/DotnetSpider.Core/Spider.cs b/src/DotnetSpider.Core/Spider.cs index a62b77327..ac0c36f07 100644 --- a/src/DotnetSpider.Core/Spider.cs +++ b/src/DotnetSpider.Core/Spider.cs @@ -37,7 +37,7 @@ public class Spider : AppBase, ISpider, ISpeedMonitor { private Site _site = new Site(); private IScheduler _scheduler = new QueueDuplicateRemovedScheduler(); - private IDownloader _downloader = new HttpWebRequestDownloader(); + private IDownloader _downloader = new HttpClientDownloader(); private List _cached; private int _waitCountLimit = 1500; private bool _inited; @@ -48,9 +48,8 @@ public class Spider : AppBase, ISpider, ISpeedMonitor private int _pipelineCachedSize = 1; private RetryPolicy _pipelineRetry; private readonly AutomicLong _requestedCount = new AutomicLong(0); - private MemoryMappedFile _identityMmf; - private MemoryMappedFile _taskIdMmf; - private readonly string[] _closeSignalFiles = new string[2]; + private MemoryMappedFile[] _mmfCloseSignals = new MemoryMappedFile[2]; + private readonly string[] _filecloseSignals = new string[2]; private bool _exited; private IMonitor _monitor; private int _statusFlushInterval = 5000; @@ -63,6 +62,14 @@ public class Spider : AppBase, ISpider, ISpeedMonitor private long _pipelineCostTimes; private long _processorCostTimes; + /// + /// 自定义的初始化 + /// + /// 运行参数 + protected virtual void OnInit(params string[] arguments) + { + } + /// /// 是否需要通过StartUrlsBuilder来初始化起始链接 /// @@ -406,10 +413,10 @@ public bool SkipTargetRequestsWhenResultIsEmpty public Spider(Site site, string identity, IScheduler scheduler, IEnumerable pageProcessors, IEnumerable pipelines) { + ThreadPool.SetMinThreads(256, 256); #if NETSTANDARD - Encoding.RegisterProvider(CodePagesEncodingProvider.Instance); + Encoding.RegisterProvider(CodePagesEncodingProvider.Instance); #else - ThreadPool.SetMinThreads(256, 256); ServicePointManager.DefaultConnectionLimit = 1000; #endif Site = site; @@ -704,6 +711,13 @@ protected override void Execute(params string[] arguments) return; } + Logger.Information("Oninit..."); + + NetworkCenter.Current.Execute("onInit", () => + { + OnInit(arguments); + }); + CheckSettings(); InitComponents(arguments); @@ -876,7 +890,7 @@ internal void SendExitSignal() } else { - File.Create(_closeSignalFiles[0]); + File.Create(_filecloseSignals[0]); } } @@ -947,14 +961,13 @@ protected void CheckSettings() /// protected virtual void InitComponents(params string[] arguments) { - Logger.Information("Init internal component..."); + Logger.Information("Init components..."); if (Site.Headers == null) { Site.Headers = new Dictionary(); } - Site.Accept = Site.Accept ?? "application/json, text/javascript, */*; q=0.01"; Site.UserAgent = Site.UserAgent ?? "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36"; if (!Site.Headers.ContainsKey("Accept-Language")) @@ -969,24 +982,24 @@ protected virtual void InitComponents(params string[] arguments) ResetScheduler(); } - Downloader = Downloader ?? new HttpWebRequestDownloader(); + Downloader = Downloader ?? new HttpClientDownloader(); if (PageProcessors == null || PageProcessors.Count == 0) { - throw new ArgumentException("PageProcessor unfound."); + throw new ArgumentException("There is no usable pipeline."); } - InitPipelines(arguments); + PreparePipelines(arguments); - InitCloseSignals(); + MonitorCloseSignals(); Monitor = Monitor ?? (string.IsNullOrWhiteSpace(Env.HubServiceUrl) ? new LogMonitor() : new HttpMonitor()); _failingRequestsLogger = LogUtil.CreateFailingRequestsLogger(Identity); - RunStartUrlBuilders(arguments); + RunRequestBuilders(arguments); - LoadScheduler(); + PrepareScheduler(); _monitorFlushInterval = CalculateMonitorFlushInterval(); @@ -1039,9 +1052,13 @@ protected void OnDispose() SafeDestroy(PageProcessors); SafeDestroy(Downloader); - //SafeDestroy(Site.HttpProxyPool); - SafeDestroy(_identityMmf); - SafeDestroy(_taskIdMmf); + if (Env.IsWindows) + { + foreach (var mmf in _mmfCloseSignals) + { + SafeDestroy(mmf); + } + } } /// @@ -1290,7 +1307,7 @@ protected override void CheckIfRunning() /// 初始化数据管道 /// /// 运行参数 - protected virtual void InitPipelines(params string[] arguments) + protected virtual void PreparePipelines(params string[] arguments) { _cached = new List(PipelineCachedSize); @@ -1306,11 +1323,10 @@ protected virtual void InitPipelines(params string[] arguments) { Pipelines.Add(defaultPipeline); } - } - - if (Pipelines.Count == 0) - { - throw new SpiderException("Pipeline unfound."); + else + { + throw new SpiderException("Count of pipelines should larger than one."); + } } } @@ -1322,7 +1338,7 @@ private void WaitNewUrl(ref int waitCount) private void SafeDestroy(object obj) { - if (obj is IDisposable disposable) + if (obj != null && obj is IDisposable disposable) { try { @@ -1369,7 +1385,7 @@ private void CalculatePipelineSpeed(long time) AvgPipelineSpeed = _pipelineCostTimes / _pipelineTimes; } - private void RunStartUrlBuilders(params string[] arguments) + private void RunRequestBuilders(params string[] arguments) { if (RequestBuilders != null && RequestBuilders.Count > 0 && IfRequireRunRequestBuilders(arguments)) { @@ -1378,7 +1394,7 @@ private void RunStartUrlBuilders(params string[] arguments) for (int i = 0; i < RequestBuilders.Count; ++i) { var builder = RequestBuilders[i]; - Logger.Information($"Add start urls via builder[{i + 1}]."); + Logger.Information($"Add start request via builder[{i + 1}]."); builder.Build(Site); } } @@ -1389,7 +1405,7 @@ private void RunStartUrlBuilders(params string[] arguments) } } - private void LoadScheduler() + private void PrepareScheduler() { if (Site.Requests != null && Site.Requests.Any()) { @@ -1415,37 +1431,37 @@ private void LoadScheduler() } } - private void InitCloseSignals() + private void MonitorCloseSignals() { if (Env.IsWindows) { - _identityMmf = MemoryMappedFile.CreateOrOpen(Identity, 1, MemoryMappedFileAccess.ReadWrite); - using (MemoryMappedViewStream stream = _identityMmf.CreateViewStream()) + _mmfCloseSignals[0] = MemoryMappedFile.CreateOrOpen(Identity, 1, MemoryMappedFileAccess.ReadWrite); + if (!string.IsNullOrWhiteSpace(TaskId)) { - var writer = new BinaryWriter(stream); - writer.Write(false); + _mmfCloseSignals[1] = MemoryMappedFile.CreateOrOpen(TaskId, 1, MemoryMappedFileAccess.ReadWrite); } - - if (!string.IsNullOrWhiteSpace(TaskId)) + foreach (var mmf in _mmfCloseSignals) { - _taskIdMmf = MemoryMappedFile.CreateOrOpen(TaskId, 1, MemoryMappedFileAccess.ReadWrite); - using (MemoryMappedViewStream stream = _taskIdMmf.CreateViewStream()) + if (mmf != null) { - var writer = new BinaryWriter(stream); - writer.Write(false); + using (MemoryMappedViewStream stream = mmf.CreateViewStream()) + { + var writer = new BinaryWriter(stream); + writer.Write(false); + } } } } else { - _closeSignalFiles[0] = Path.Combine(Env.BaseDirectory, $"{Identity}_cl"); + _filecloseSignals[0] = Path.Combine(Env.BaseDirectory, $"{Identity}_cl"); if (!string.IsNullOrWhiteSpace(TaskId)) { - _closeSignalFiles[1] = Path.Combine(Env.BaseDirectory, $"{TaskId}_cl"); + _filecloseSignals[1] = Path.Combine(Env.BaseDirectory, $"{TaskId}_cl"); } - foreach (var closeSignal in _closeSignalFiles) + foreach (var closeSignal in _filecloseSignals) { if (File.Exists(closeSignal)) { @@ -1500,42 +1516,36 @@ private void CheckExitSignal() // MMF 暂时还不支持非WINDOWS操作系统 if (Env.IsWindows) { - CheckExitSignalByMmf(); + CheckMmfCloseSignals(); } else { - CheckExitSignalByFile(); + CheckFileCloseSignals(); } } - private void CheckExitSignalByMmf() + private void CheckMmfCloseSignals() { - using (MemoryMappedViewStream stream = _identityMmf.CreateViewStream()) - { - var reader = new BinaryReader(stream); - if (reader.ReadBoolean()) - { - Exit(); - return; - } - } - - if (_taskIdMmf != null) + foreach (var mmf in _mmfCloseSignals) { - using (MemoryMappedViewStream stream = _taskIdMmf.CreateViewStream()) + if (mmf != null) { - var reader = new BinaryReader(stream); - if (reader.ReadBoolean()) + using (MemoryMappedViewStream stream = mmf.CreateViewStream()) { - Exit(); + var reader = new BinaryReader(stream); + if (reader.ReadBoolean()) + { + Exit(); + return; + } } } } } - private void CheckExitSignalByFile() + private void CheckFileCloseSignals() { - if (File.Exists(_closeSignalFiles[0]) || File.Exists(_closeSignalFiles[1])) + if (File.Exists(_filecloseSignals[0]) || File.Exists(_filecloseSignals[1])) { Exit(); } diff --git a/src/DotnetSpider.Downloader.Test/DotnetSpider.Downloader.Test.csproj b/src/DotnetSpider.Downloader.Test/DotnetSpider.Downloader.Test.csproj index 464b6ad6e..51723309c 100644 --- a/src/DotnetSpider.Downloader.Test/DotnetSpider.Downloader.Test.csproj +++ b/src/DotnetSpider.Downloader.Test/DotnetSpider.Downloader.Test.csproj @@ -9,6 +9,7 @@ + diff --git a/src/DotnetSpider.Downloader.Test/HttpClientDownloaderTest.cs b/src/DotnetSpider.Downloader.Test/HttpClientDownloaderTest.cs index 95c1358ac..0a18826ee 100644 --- a/src/DotnetSpider.Downloader.Test/HttpClientDownloaderTest.cs +++ b/src/DotnetSpider.Downloader.Test/HttpClientDownloaderTest.cs @@ -1,151 +1,119 @@ -//using System.Collections.Generic; -//using DotnetSpider.Core.Downloader; -//using Xunit; -//using DotnetSpider.Core.Scheduler; -//using static DotnetSpider.Core.Test.SpiderTest; -//using DotnetSpider.Core.Pipeline; -//using System.Threading.Tasks; -//using DotnetSpider.Selector; -//#if NETSTANDARD -//using System.Text; -//#endif +using System.Collections.Generic; +using Xunit; +using System.Threading.Tasks; +using DotnetSpider.Downloader; +using DotnetSpider.Common; -//namespace DotnetSpider.Core.Test.Downloader -//{ -// public class HttpClientDownloaderTest -// { -// public HttpClientDownloaderTest() -// { -//#if NETSTANDARD -// Encoding.RegisterProvider(CodePagesEncodingProvider.Instance); -//#endif +#if !NETFRAMEWORK +using System.Text; +#endif -// } +namespace DotnetSpider.Core.Test.Downloader +{ + public class HttpClientDownloaderTest + { + public HttpClientDownloaderTest() + { +#if !NETFRAMEWORK + Encoding.RegisterProvider(CodePagesEncodingProvider.Instance); +#endif -// /// -// /// 手动执行此测试脚本,运行结束后用netstat -ano 查看端口占用情况。只会占用一个就对了 -// /// -// [Fact(Skip = "Need person double check", DisplayName = "Ports")] -// public void Ports() -// { -// HttpClientDownloader downloader = new HttpClientDownloader(); -// DefaultSpider spider = new DefaultSpider("abcd", new Site { }); + } -// for (int i = 0; i < 100; i++) -// { -// var a = downloader.Download(new Request("http://www.163.com", null), spider).Result; -// } -// } + /// + /// 手动执行此测试脚本,运行结束后用netstat -ano 查看端口占用情况。只会占用一个就对了 + /// + [Fact(Skip = "Need person double check", DisplayName = "Ports")] + public void Ports() + { + HttpClientDownloader downloader = new HttpClientDownloader(); + var site = new Site(); -// [Fact(DisplayName = "DetectDownloadContent")] -// public void DetectDownloadContent() -// { -// HttpClientDownloader downloader = new HttpClientDownloader(); -// DefaultSpider spider = new DefaultSpider("abcd", new Site { }); + for (int i = 0; i < 100; i++) + { + var a = downloader.Download(new Request("http://www.163.com", null) { Site = site }); + } + } -// var a = downloader.Download(new Request("http://www.163.com", null), spider); -// Assert.Equal(ContentType.Html, spider.Site.ContentType); + [Fact(DisplayName = "DetectDownloadContent")] + public void DetectDownloadContent() + { + HttpClientDownloader downloader = new HttpClientDownloader(); + var site = new Site(); + var a = downloader.Download(new Request("http://www.163.com", null) { Site = site }); + Assert.Equal(ContentType.Html, site.ContentType); + site = new Site(); + HttpClientDownloader2 downloader2 = new HttpClientDownloader2(); + a = downloader2.Download(new Request("http://www.163.com", null) + { + Site = site + }); + Assert.Equal(ContentType.Json, site.ContentType); + } -// HttpClientDownloader2 downloader2 = new HttpClientDownloader2(); -// DefaultSpider spider2 = new DefaultSpider("abcd", new Site { }); -// a = downloader2.Download(new Request("http://www.163.com", null), spider2); -// Assert.Equal(ContentType.Json, spider2.Site.ContentType); -// } + //[Fact(DisplayName = "_404Url")] + //public void _404Url() + //{ + // if (!Env.IsWindows) + // { + // return; + // } + // var spider = Spider.Create(new Site { EncodingName = "UTF-8", SleepTime = 1000 }, + // "abcd", + // new QueueDuplicateRemovedScheduler(), + // new TestPageProcessor()); + // spider.AddPipeline(new ConsolePipeline()); + // spider.SkipTargetUrlsWhenResultIsEmpty = false; + // spider.EmptySleepTime = 6000; + // spider.AddStartUrl("http://www.mlr.gov.cn/xwdt/jrxw/201707/t20170710_15242382.htm"); + // spider.Run(); + // Assert.Equal(5, spider.RetriedTimes.Value); + //} -// [Fact(DisplayName = "SetContentType")] -// public void SetContentType() -// { -// Site site1 = new Site -// { -// Headers = new Dictionary() -// { -// {"Content-Type","abcd" } -// } -// }; -// Site site2 = new Site -// { -// Headers = new Dictionary() -// { -// {"ContentType","abcd" } -// } -// }; -// var downloader = new HttpClientDownloader(); -// var a = downloader.Download(new Request("http://163.com", null), new DefaultSpider("test", site1)); + //[Fact(DisplayName = "_301Url")] + //public void _301Url() + //{ + // if (!Env.IsWindows) + // { + // return; + // } + // var spider = Spider.Create(new Site { EncodingName = "UTF-8", SleepTime = 1000 }, + // "abcd", + // new QueueDuplicateRemovedScheduler(), + // new TestPageProcessor()); + // spider.AddPipeline(new ConsolePipeline()); + // spider.SkipTargetUrlsWhenResultIsEmpty = true; + // spider.Downloader = new HttpClientDownloader(); + // spider.EmptySleepTime = 6000; + // spider.AddStartUrl("https://tieba.baidu.com/f?kw=%E7%AE%80%E9%98%B3&ie=utf-8&pn=50"); + // spider.Run(); + // Assert.Equal(0, spider.RetriedTimes.Value); + //} -// a = downloader.Download(new Request("http://163.com", null), new DefaultSpider("test", site2)); -// } + class HttpClientDownloader2 : HttpClientDownloader + { + protected override Response DowloadContent(Request request) + { + var page = new Response(request) { Content = "{'a':'b'}" }; + DetectContentType(page, null); + return page; + } + } -// [Fact(DisplayName = "_404Url")] -// public void _404Url() -// { -// if (!Env.IsWindows) -// { -// return; -// } -// var spider = Spider.Create(new Site { EncodingName = "UTF-8", SleepTime = 1000 }, -// "abcd", -// new QueueDuplicateRemovedScheduler(), -// new TestPageProcessor()); -// spider.AddPipeline(new ConsolePipeline()); -// spider.SkipTargetUrlsWhenResultIsEmpty = false; -// spider.EmptySleepTime = 6000; -// spider.AddStartUrl("http://www.mlr.gov.cn/xwdt/jrxw/201707/t20170710_15242382.htm"); -// spider.Run(); -// Assert.Equal(5, spider.RetriedTimes.Value); -// } - -// [Fact(DisplayName = "_301Url")] -// public void _301Url() -// { -// if (!Env.IsWindows) -// { -// return; -// } -// var spider = Spider.Create(new Site { EncodingName = "UTF-8", SleepTime = 1000 }, -// "abcd", -// new QueueDuplicateRemovedScheduler(), -// new TestPageProcessor()); -// spider.AddPipeline(new ConsolePipeline()); -// spider.SkipTargetUrlsWhenResultIsEmpty = true; -// spider.Downloader = new HttpClientDownloader(); -// spider.EmptySleepTime = 6000; -// spider.AddStartUrl("https://tieba.baidu.com/f?kw=%E7%AE%80%E9%98%B3&ie=utf-8&pn=50"); -// spider.Run(); -// Assert.Equal(0, spider.RetriedTimes.Value); -// } - -// class HttpClientDownloader2 : HttpClientDownloader -// { -// protected override Task DowloadContent(Request request, ISpider spider) -// { -// var page = new Page(request) { Content = "{'a':'b'}" }; -// return Task.FromResult(page); -// } -// } - -// [Fact(DisplayName = "GetTargetUrlWhenRedirect")] -// public void GetTargetUrlWhenRedirect() -// { -// Site site = new Site -// { -// Headers = new Dictionary -// { -// { "User-Agent", "Chrome" } -// } -// }; -// var downloader = new HttpClientDownloader(); -// var page = downloader.Download(new Request("http://item.jd.com/1231222221111123.html", null), new DefaultSpider("test", site)).Result; -// Assert.DoesNotContain("1231222221111123", page.TargetUrl); -// Assert.True(page.TargetUrl.Contains("www.jd.com/") || page.TargetUrl.Contains("global.jd.com")); -// } - -// [Fact(DisplayName = "SetTimeout")] -// public void SetTimeout() -// { -// HttpClientDownloader downloader = new HttpClientDownloader(); -// var entry = HttpClientDownloader.HttpClientPool.GetHttpClient("a"); -// downloader.PrepareHttpClient(entry, null); -// Assert.Equal(8, entry.Client.Timeout.TotalSeconds); -// } -// } -//} + [Fact(DisplayName = "GetTargetUrlWhenRedirect")] + public void GetTargetUrlWhenRedirect() + { + Site site = new Site + { + Headers = new Dictionary + { + { "User-Agent", "Chrome" } + } + }; + var downloader = new HttpClientDownloader(); + var page = downloader.Download(new Request("http://item.jd.com/1231222221111123.html", null) { Site = site }); + Assert.DoesNotContain("1231222221111123", page.TargetUrl); + Assert.True(page.TargetUrl.Contains("www.jd.com/") || page.TargetUrl.Contains("global.jd.com")); + } + } +} diff --git a/src/DotnetSpider.Downloader/BaseDownloader.cs b/src/DotnetSpider.Downloader/BaseDownloader.cs index a9953f1e9..20e5ba7fd 100644 --- a/src/DotnetSpider.Downloader/BaseDownloader.cs +++ b/src/DotnetSpider.Downloader/BaseDownloader.cs @@ -4,6 +4,7 @@ using System.Net; using System.Text; using DotnetSpider.Common; +using Newtonsoft.Json; namespace DotnetSpider.Downloader { @@ -28,6 +29,9 @@ public abstract class BaseDownloader : IDownloader /// protected readonly CookieContainer CookieContainer = new CookieContainer(); + public WebProxy FiddlerProxy { get; set; } = new WebProxy("http://127.0.0.1:8888"); + public bool UseFiddlerProxy { get; set; } = false; + /// /// 是否自动跳转 /// @@ -198,14 +202,30 @@ protected virtual void DetectContentType(Response response, string contentType) { if (response.Request.Site.ContentType == ContentType.Auto) { - if (contentType.Contains("json")) + if (!string.IsNullOrWhiteSpace(contentType)) { - response.ContentType = ContentType.Json; + if (contentType.Contains("json")) + { + response.ContentType = ContentType.Json; + } + else + { + response.ContentType = ContentType.Html; + } } else { - response.ContentType = ContentType.Html; + try + { + JsonConvert.DeserializeObject(response.Content); + response.ContentType = ContentType.Json; + } + catch + { + response.ContentType = ContentType.Html; + } } + response.Request.Site.ContentType = response.ContentType; } else { @@ -271,7 +291,7 @@ public virtual void Dispose() protected void EnsureSuccessStatusCode(HttpStatusCode code) { - if ((int)code >= 200 && ((int)code <= 299)) + if (((int)code >= 200 && ((int)code <= 299)) || ((int)code >= 300 && ((int)code <= 399))) { return; } diff --git a/src/DotnetSpider.Downloader/DotnetSpider.Downloader.csproj b/src/DotnetSpider.Downloader/DotnetSpider.Downloader.csproj index 020da06e5..f0c250fa6 100644 --- a/src/DotnetSpider.Downloader/DotnetSpider.Downloader.csproj +++ b/src/DotnetSpider.Downloader/DotnetSpider.Downloader.csproj @@ -3,7 +3,7 @@ netstandard2.0;net40;net45; true true - 3.0.0-beta1 + 3.0.0 zlzforever@163.com; DotnetSpider.Downloader Copyright 2018 Lewis Zou @@ -20,16 +20,11 @@ - - - - - diff --git a/src/DotnetSpider.Downloader/HttpClientDownloader.cs b/src/DotnetSpider.Downloader/HttpClientDownloader.cs index 07af437de..68f91f6ab 100644 --- a/src/DotnetSpider.Downloader/HttpClientDownloader.cs +++ b/src/DotnetSpider.Downloader/HttpClientDownloader.cs @@ -87,27 +87,38 @@ protected override Response DowloadContent(Request request) WebProxy proxy = null; try { - - if (HttpProxyPool.Instance != null) + if (UseFiddlerProxy) { - proxy = HttpProxyPool.Instance.GetProxy(); - if (proxy == null) + if (FiddlerProxy == null) { - throw new DownloaderException("No avaliable proxy."); + throw new DownloaderException("Fiddler proxy is null."); } else { - _clientObject = GetHttpClient($"{proxy.Address.ToString()}", AllowAutoRedirect, proxy); - httpResponseMessage = - NetworkCenter.Current.Execute("downloader", () => _clientObject.Client.SendAsync(httpRequestMessage).Result); + proxy = FiddlerProxy; } } else { - httpResponseMessage = - NetworkCenter.Current.Execute("downloader", () => Default.SendAsync(httpRequestMessage).Result); + if (HttpProxyPool.Instance != null) + { + proxy = HttpProxyPool.Instance.GetProxy(); + if (proxy == null) + { + throw new DownloaderException("No avaliable proxy."); + } + } + else + { + _clientObject = GetHttpClient("DEFAULT", AllowAutoRedirect, null); + } } + _clientObject = GetHttpClient(proxy == null ? "DEFAULT" : $"{proxy.Address.ToString()}", AllowAutoRedirect, proxy); + + httpResponseMessage = + NetworkCenter.Current.Execute("downloader", () => _clientObject.Client.SendAsync(httpRequestMessage).Result); + response.StatusCode = httpResponseMessage.StatusCode; EnsureSuccessStatusCode(response.StatusCode); response.TargetUrl = httpResponseMessage.RequestMessage.RequestUri.AbsoluteUri; @@ -209,6 +220,7 @@ private HttpClientObject GetHttpClient(string hash, bool allowAutoRedirect, IWeb { hash = string.Empty; } + Interlocked.Increment(ref _getHttpClientCount); if (_getHttpClientCount % 100 == 0) @@ -263,8 +275,7 @@ private void CleanupPool() private HttpRequestMessage GenerateHttpRequestMessage(Request request) { - HttpRequestMessage httpRequestMessage = new HttpRequestMessage(new System.Net.Http.HttpMethod(request.Method.ToString()), request.Url); - + HttpRequestMessage httpRequestMessage = new HttpRequestMessage(request.Method, request.Url); var userAgentHeader = "User-Agent"; httpRequestMessage.Headers.TryAddWithoutValidation(userAgentHeader, request.Site.Headers.ContainsKey(userAgentHeader) ? request.Site.Headers[userAgentHeader] : request.Site.UserAgent); @@ -297,7 +308,7 @@ private HttpRequestMessage GenerateHttpRequestMessage(Request request) } } - if (request.Method == Common.HttpMethod.Post) + if (request.Method == HttpMethod.Post) { var data = string.IsNullOrWhiteSpace(request.Site.EncodingName) ? Encoding.UTF8.GetBytes(request.Content) : Encoding.GetEncoding(request.Site.EncodingName).GetBytes(request.Content); httpRequestMessage.Content = new StreamContent(new MemoryStream(data)); diff --git a/src/DotnetSpider.Downloader/HttpWebRequestDownloader.cs b/src/DotnetSpider.Downloader/HttpWebRequestDownloader.cs index a968a91cf..9d314b24e 100644 --- a/src/DotnetSpider.Downloader/HttpWebRequestDownloader.cs +++ b/src/DotnetSpider.Downloader/HttpWebRequestDownloader.cs @@ -2,6 +2,7 @@ using System.IO; using System.Linq; using System.Net; +using System.Net.Http; using System.Net.Security; using System.Security.Cryptography.X509Certificates; using System.Text; @@ -21,9 +22,6 @@ public class HttpWebRequestDownloader : BaseDownloader private readonly int _timeout; private readonly bool _decodeHtml; - public WebProxy FiddlerProxy { get; set; } = new WebProxy("http://127.0.0.1:8888"); - public bool UseFiddlerProxy { get; set; } = false; - public HttpWebRequestDownloader(int timeout = 8000, bool decodeHtml = false) { _timeout = timeout; @@ -141,7 +139,7 @@ private HttpWebRequest GenerateHttpWebRequest(Request request) { var site = request.Site; var httpWebRequest = (HttpWebRequest)WebRequest.Create(request.Url); - httpWebRequest.Method = request.Method.ToString().ToUpper(); + httpWebRequest.Method = request.Method.Method; // Add user-agent var userAgentHeader = "User-Agent"; diff --git a/src/DotnetSpider.Extension.Test/DataHandlerTest.cs b/src/DotnetSpider.Extension.Test/DataHandlerTest.cs index a0409ad9b..5b9711ff1 100644 --- a/src/DotnetSpider.Extension.Test/DataHandlerTest.cs +++ b/src/DotnetSpider.Extension.Test/DataHandlerTest.cs @@ -25,7 +25,7 @@ public void Handle(ref dynamic data, Page page) [Fact(DisplayName = "HandleModel")] public void HandleModel() { - var model = new ModelDefine(); + var model = new ModelDefinition(); EntityProcessor processor = new EntityProcessor(null, null, new MyDataHandler()); processor.Process(new Page(new Request("http://www.abcd.com") { Site = new Site() }) @@ -41,7 +41,7 @@ public void HandleModel() [EntitySelector(Expression = "$.data[*]", Type = SelectorType.JsonPath)] private class Product { - [Field(Expression = "$.name", Type = SelectorType.JsonPath, Length = 100)] + [FieldSelector(Expression = "$.name", Type = SelectorType.JsonPath, Length = 100)] public string name { get; set; } } } diff --git a/src/DotnetSpider.Extension.Test/Downloader/WebDriverDownloaderTest.cs b/src/DotnetSpider.Extension.Test/Downloader/WebDriverDownloaderTest.cs index 3a3c3b951..be3951f37 100644 --- a/src/DotnetSpider.Extension.Test/Downloader/WebDriverDownloaderTest.cs +++ b/src/DotnetSpider.Extension.Test/Downloader/WebDriverDownloaderTest.cs @@ -11,6 +11,7 @@ using DotnetSpider.Extraction; using DotnetSpider.Extraction.Model.Formatter; using DotnetSpider.Common; +using DotnetSpider.Extraction.Model; #if NETSTANDARD using System.Runtime.InteropServices; #endif @@ -64,7 +65,7 @@ public HeadlessSpider() : base("HeadlessSpider") { } - protected override void MyInit(params string[] arguments) + protected override void OnInit(params string[] arguments) { Monitor = new LogMonitor(); Identity = "HeadlessSpider"; @@ -86,7 +87,7 @@ public WebDriverDownloaderSpider() : base("WebDriverDownloader") { } - protected override void MyInit(params string[] arguments) + protected override void OnInit(params string[] arguments) { var word = "可乐|雪碧"; AddStartUrl(string.Format("http://news.baidu.com/ns?word={0}&tn=news&from=news&cl=2&pn=0&rn=20&ct=1", word), new Dictionary { { "Keyword", word } }); @@ -100,39 +101,39 @@ protected override void MyInit(params string[] arguments) [EntitySelector(Expression = ".//div[@class='result']", Type = SelectorType.XPath)] private class BaiduSearchEntry { - [Field(Expression = "Keyword", Type = SelectorType.Enviroment)] + [FieldSelector(Expression = "Keyword", Type = SelectorType.Enviroment)] public string Keyword { get; set; } - [Field(Expression = ".//h3[@class='c-title']/a")] + [FieldSelector(Expression = ".//h3[@class='c-title']/a")] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = "", OldValue = "")] public string Title { get; set; } - [Field(Expression = ".//h3[@class='c-title']/a/@href")] + [FieldSelector(Expression = ".//h3[@class='c-title']/a/@href")] public string Url { get; set; } - [Field(Expression = ".//div/p[@class='c-author']/text()")] + [FieldSelector(Expression = ".//div/p[@class='c-author']/text()")] [ReplaceFormatter(NewValue = "-", OldValue = " ")] public string Website { get; set; } - [Field(Expression = ".//div/span/a[@class='c-cache']/@href")] + [FieldSelector(Expression = ".//div/span/a[@class='c-cache']/@href")] public string Snapshot { get; set; } - [Field(Expression = ".//div[@class='c-summary c-row ']", Option = FieldOptions.InnerText)] + [FieldSelector(Expression = ".//div[@class='c-summary c-row ']", Option = FieldOptions.InnerText)] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = " ", OldValue = " ")] public string Details { get; set; } - [Field(Expression = ".", Option = FieldOptions.InnerText)] + [FieldSelector(Expression = ".", Option = FieldOptions.InnerText)] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = " ", OldValue = " ")] public string PlainText { get; set; } - [Field(Expression = "today", Type = SelectorType.Enviroment)] + [FieldSelector(Expression = "today", Type = SelectorType.Enviroment)] public DateTime RunId { get; set; } } } diff --git a/src/DotnetSpider.Extension.Test/EntitySpiderTest.cs b/src/DotnetSpider.Extension.Test/EntitySpiderTest.cs index ededc1796..8eb06166e 100644 --- a/src/DotnetSpider.Extension.Test/EntitySpiderTest.cs +++ b/src/DotnetSpider.Extension.Test/EntitySpiderTest.cs @@ -12,6 +12,7 @@ using DotnetSpider.Common; using DotnetSpider.Extraction; using DotnetSpider.Extraction.Model.Formatter; +using DotnetSpider.Extraction.Model; namespace DotnetSpider.Extension.Test { @@ -134,7 +135,7 @@ public void EntitySpiderRunCorrect() [TableInfo("test", "table")] private class TestEntity { - [Field(Expression = ".")] + [FieldSelector(Expression = ".")] public string Name { get; set; } } @@ -144,7 +145,7 @@ private class ClearSchedulerSpider : EntitySpider { } - protected override void MyInit(params string[] arguments) + protected override void OnInit(params string[] arguments) { Monitor = new LogMonitor(); Identity = Guid.NewGuid().ToString("N"); @@ -164,7 +165,7 @@ public BaiduSearchSpider(string guid) : base("BaiduSearch") _guid = guid; } - protected override void MyInit(params string[] arguments) + protected override void OnInit(params string[] arguments) { var word = "可乐|雪碧"; AddStartUrl(string.Format("http://news.baidu.com/ns?word={0}&tn=news&from=news&cl=2&pn=0&rn=20&ct=1", word), @@ -179,36 +180,36 @@ protected override void MyInit(params string[] arguments) [EntitySelector(Expression = ".//div[@class='result']", Type = SelectorType.XPath)] class BaiduSearchEntry { - [Field(Expression = "Keyword", Type = SelectorType.Enviroment, Length = 100)] + [FieldSelector(Expression = "Keyword", Type = SelectorType.Enviroment, Length = 100)] public string Keyword { get; set; } - [Field(Expression = "guid", Type = SelectorType.Enviroment, Length = 100)] + [FieldSelector(Expression = "guid", Type = SelectorType.Enviroment, Length = 100)] public string Guid { get; set; } - [Field(Expression = ".//h3[@class='c-title']/a")] + [FieldSelector(Expression = ".//h3[@class='c-title']/a")] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = "", OldValue = "")] public string Title { get; set; } - [Field(Expression = ".//h3[@class='c-title']/a/@href")] + [FieldSelector(Expression = ".//h3[@class='c-title']/a/@href")] public string Url { get; set; } - [Field(Expression = ".//div/p[@class='c-author']/text()")] + [FieldSelector(Expression = ".//div/p[@class='c-author']/text()")] [ReplaceFormatter(NewValue = "-", OldValue = " ")] public string Website { get; set; } - [Field(Expression = ".//div/span/a[@class='c-cache']/@href")] + [FieldSelector(Expression = ".//div/span/a[@class='c-cache']/@href")] public string Snapshot { get; set; } - [Field(Expression = ".//div[@class='c-summary c-row ']", Option = FieldOptions.InnerText)] + [FieldSelector(Expression = ".//div[@class='c-summary c-row ']", Option = FieldOptions.InnerText)] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = " ", OldValue = " ")] public string Details { get; set; } - [Field(Expression = ".", Option = FieldOptions.InnerText)] + [FieldSelector(Expression = ".", Option = FieldOptions.InnerText)] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = " ", OldValue = " ")] @@ -222,7 +223,7 @@ private class CasSpider : EntitySpider { } - protected override void MyInit(params string[] arguments) + protected override void OnInit(params string[] arguments) { Identity = Guid.NewGuid().ToString(); EmptySleepTime = 5000; @@ -234,18 +235,18 @@ protected override void MyInit(params string[] arguments) [EntitySelector(Expression = "//div[@class='ztlb_ld_mainR_box01_list']/ul/li")] class ArticleSummary { - [Field(Expression = ".//a/@title")] + [FieldSelector(Expression = ".//a/@title")] public string Title { get; set; } [ToNext(Extras = new[] { "Title", "Url" })] - [Field(Expression = ".//a/@href")] + [FieldSelector(Expression = ".//a/@href")] public string Url { get; set; } } } private class MultiEntitySpider : EntitySpider { - protected override void MyInit(params string[] arguments) + protected override void OnInit(params string[] arguments) { Site = new Site { @@ -266,7 +267,7 @@ protected override void MyInit(params string[] arguments) [TargetRequestSelector(Patterns = new[] { "http://www.163.com" })] public class NeteastEntity { - [Field(Expression = ".//title")] + [FieldSelector(Expression = ".//title")] public string Title { get; set; } } @@ -274,7 +275,7 @@ public class NeteastEntity [TargetRequestSelector(Patterns = new[] { "http://www.sohu.com" })] public class SohuEntity { - [Field(Expression = ".//title")] + [FieldSelector(Expression = ".//title")] public string Title { get; set; } } } diff --git a/src/DotnetSpider.Extension.Test/LogTest.cs b/src/DotnetSpider.Extension.Test/LogTest.cs index 84d066663..eb22a1921 100644 --- a/src/DotnetSpider.Extension.Test/LogTest.cs +++ b/src/DotnetSpider.Extension.Test/LogTest.cs @@ -85,7 +85,7 @@ class statusObj internal class TestPipeline : BasePipeline { - public override void Process(IEnumerable resultItems, ILogger logger, dynamic sender = null) + public override void Process(IList resultItems, ILogger logger, dynamic sender = null) { foreach (var resultItem in resultItems) { diff --git a/src/DotnetSpider.Extension.Test/Pipeline/EnvPipelineTest.cs b/src/DotnetSpider.Extension.Test/Pipeline/EnvPipelineTest.cs index 65c872c08..46d486bd2 100644 --- a/src/DotnetSpider.Extension.Test/Pipeline/EnvPipelineTest.cs +++ b/src/DotnetSpider.Extension.Test/Pipeline/EnvPipelineTest.cs @@ -19,7 +19,7 @@ private void PrepareGlobalFile(string name) [Fact(DisplayName = "EnvConfigSetEmpty")] public void EnvConfigSetEmpty() { - lock (Env.Version) + lock (Env.BaseDirectory) { var arguments1 = Startup.Parse("-s:DotnetSpider.Extension.Test.Pipeline.TestSpider2", "--tid:TestSpider", "-i:guid", "-a:", "-c:"); Startup.LoadConfiguration(arguments1.Config); @@ -32,7 +32,7 @@ public void EnvConfigSetEmpty() [Fact(DisplayName = "EnvSetGlobal1")] public void EnvSetGlobal1() { - lock (Env.Version) + lock (Env.BaseDirectory) { PrepareGlobalFile("app.global.1.config"); var args1 = new[] { "-s:DotnetSpider.Extension.Test.Pipeline.TestSpider2", "--tid:TestSpider", "-i:guid", "-a:", "-c:%GLOBAL%app.global.1.config" }; @@ -45,7 +45,7 @@ public void EnvSetGlobal1() [Fact(DisplayName = "EnvSetGlobal2")] public void EnvSetGlobal2() { - lock (Env.Version) + lock (Env.BaseDirectory) { PrepareGlobalFile("app.config"); var args1 = new[] { "-s:DotnetSpider.Extension.Test.Pipeline.TestSpider2", "--tid:TestSpider", "-i:guid", "-a:", "-c:%GLOBAL%" }; diff --git a/src/DotnetSpider.Extension.Test/Pipeline/MySqlEntityFilePipelineTest.cs b/src/DotnetSpider.Extension.Test/Pipeline/MySqlEntityFilePipelineTest.cs index 9cf4fe3e2..421e48ed8 100644 --- a/src/DotnetSpider.Extension.Test/Pipeline/MySqlEntityFilePipelineTest.cs +++ b/src/DotnetSpider.Extension.Test/Pipeline/MySqlEntityFilePipelineTest.cs @@ -41,7 +41,7 @@ public MySqlFileEntityPipelineSpider() : base("MySqlFileEntityPipelineSpider") { } - protected override void MyInit(params string[] arguments) + protected override void OnInit(params string[] arguments) { EmptySleepTime = 1000; var word = "可乐|雪碧"; @@ -54,7 +54,7 @@ protected override void MyInit(params string[] arguments) [EntitySelector(Expression = ".//div[@class='result']", Type = SelectorType.XPath)] class BaiduSearchEntry { - [Field(Expression = "Keyword", Type = SelectorType.Enviroment)] + [FieldSelector(Expression = "Keyword", Type = SelectorType.Enviroment)] public string Keyword { get; set; } } } diff --git a/src/DotnetSpider.Extension.Test/Pipeline/MySqlEntityPipelineTest.cs b/src/DotnetSpider.Extension.Test/Pipeline/MySqlEntityPipelineTest.cs index 6b7988f19..eebf8bf15 100644 --- a/src/DotnetSpider.Extension.Test/Pipeline/MySqlEntityPipelineTest.cs +++ b/src/DotnetSpider.Extension.Test/Pipeline/MySqlEntityPipelineTest.cs @@ -62,20 +62,21 @@ public virtual void DataTypes() var pipeline = new MySqlEntityPipeline("Database='mysql';Data Source=localhost;User ID=root;Password=;Port=3306;SslMode=None;"); var resultItems = new ResultItems(); resultItems.Request = new Request(); - resultItems.AddOrUpdateResultItem(processor.Model.Identity, new Tuple>(processor.Model, new dynamic[] { - new Dictionary - { - { "int", "1"}, - { "bool", "1"}, - { "bigint", "11"}, - { "string", "aaa"}, - { "time", "2018-06-12"}, - { "float", "1"}, - { "double", "1"}, - { "string1", "abc"}, - { "string2", "abcdd"}, - { "decimal", "1"} - } + resultItems.AddOrUpdateResultItem(processor.Model.Identity, + new Tuple>(processor.Model, new[] { + new Dictionary + { + { "int", "1"}, + { "bool", "1"}, + { "bigint", "11"}, + { "string", "aaa"}, + { "time", "2018-06-12"}, + { "float", "1"}, + { "double", "1"}, + { "string1", "abc"}, + { "string2", "abcdd"}, + { "decimal", "1"} + } })); pipeline.Process(new ResultItems[] { resultItems }, spider.Logger, spider); @@ -137,12 +138,12 @@ public virtual void Update_AutoIncrementPrimaryKey() var pipeline = CreatePipeline(); // 2. Create ModelDefine - var metadata = new ModelDefine(); + var metadata = new ModelDefinition(); // 3. Create data var resultItems = new ResultItems(); resultItems.Request = new Request(); - resultItems.AddOrUpdateResultItem(metadata.Identity, new Tuple>(metadata, new dynamic[] + resultItems.AddOrUpdateResultItem(metadata.Identity, new Tuple>(metadata, new dynamic[] { new AutoIncrementPrimaryKey { Sku = "110", Category = "3C", Name = "Product 1" }, new AutoIncrementPrimaryKey { Sku = "111", Category = "3C", Name = "Product 2" } @@ -156,7 +157,7 @@ public virtual void Update_AutoIncrementPrimaryKey() resultItems = new ResultItems(); resultItems.Request = new Request(); - resultItems.AddOrUpdateResultItem(metadata.Identity, new Tuple>(metadata, new dynamic[] + resultItems.AddOrUpdateResultItem(metadata.Identity, new Tuple>(metadata, new dynamic[] { new AutoIncrementPrimaryKey { Id = 1, Category = "4C" } })); @@ -192,12 +193,12 @@ public virtual void Update_MutliPrimaryKey() var pipeline = CreatePipeline(); // 2. Create ModelDefine - var metadata = new ModelDefine(); + var metadata = new ModelDefinition(); // 3. Create data var resultItems = new ResultItems(); resultItems.Request = new Request(); - resultItems.AddOrUpdateResultItem(metadata.Identity, new Tuple>(metadata, new dynamic[] + resultItems.AddOrUpdateResultItem(metadata.Identity, new Tuple>(metadata, new dynamic[] { new MultiPrimaryKey { Sku = "110", Category = "3C", Name = "Product 1" }, new MultiPrimaryKey { Sku = "111", Category = "3C", Name = "Product 2" } @@ -211,7 +212,7 @@ public virtual void Update_MutliPrimaryKey() resultItems = new ResultItems(); resultItems.Request = new Request(); - resultItems.AddOrUpdateResultItem(metadata.Identity, new Tuple>(metadata, new dynamic[] + resultItems.AddOrUpdateResultItem(metadata.Identity, new Tuple>(metadata, new dynamic[] { new MultiPrimaryKey { Sku="111", Category = "4C", Name="Product 2" } })); @@ -244,12 +245,12 @@ public virtual void Insert_AutoIncrementPrimaryKey() var spider = new DefaultSpider(); var pipeline = CreatePipeline(); - var metadata = new ModelDefine(); + var metadata = new ModelDefinition(); var resultItems = new ResultItems(); resultItems.Request = new Request(); - resultItems.AddOrUpdateResultItem(metadata.Identity, new Tuple>(metadata, new dynamic[] + resultItems.AddOrUpdateResultItem(metadata.Identity, new Tuple>(metadata, new dynamic[] { new AutoIncrementPrimaryKey { Sku = "110", Category = "3C", Name = "Product 1" }, new AutoIncrementPrimaryKey { Sku = "111", Category = "3C", Name = "Product 2" }, @@ -284,12 +285,12 @@ public virtual void Insert_NonePrimaryKey() var spider = new DefaultSpider(); var pipeline = CreatePipeline(); - var metadata = new ModelDefine(); + var metadata = new ModelDefinition(); var resultItems = new ResultItems(); resultItems.Request = new Request(); - resultItems.AddOrUpdateResultItem(metadata.Identity, new Tuple>(metadata, new dynamic[] + resultItems.AddOrUpdateResultItem(metadata.Identity, new Tuple>(metadata, new dynamic[] { new NonePrimaryKey { Sku = "110", Category = "3C", Name = "Product 1" }, new NonePrimaryKey { Sku = "111", Category = "3C", Name = "Product 2" }, @@ -328,12 +329,12 @@ public virtual void Insert_AutoTimestamp() var spider = new DefaultSpider(); var pipeline = CreatePipeline(); - var metadata = new ModelDefine(); + var metadata = new ModelDefinition(); var resultItems = new ResultItems(); resultItems.Request = new Request(); - resultItems.AddOrUpdateResultItem(metadata.Identity, new Tuple>(metadata, new dynamic[] + resultItems.AddOrUpdateResultItem(metadata.Identity, new Tuple>(metadata, new dynamic[] { new Timestamp { Sku = "110", Category = "3C", Name = "Product 1" }, new Timestamp { Sku = "111", Category = "3C", Name = "Product 2" }, @@ -377,12 +378,12 @@ public virtual void Insert_NoneTimestamp() var pipeline = CreatePipeline(); pipeline.AutoTimestamp = false; - var metadata = new ModelDefine(); + var metadata = new ModelDefinition(); var resultItems = new ResultItems(); resultItems.Request = new Request(); - resultItems.AddOrUpdateResultItem(metadata.Identity, new Tuple>(metadata, new dynamic[] + resultItems.AddOrUpdateResultItem(metadata.Identity, new Tuple>(metadata, new dynamic[] { new Timestamp { Sku = "110", Category = "3C", Name = "Product 1" }, new Timestamp { Sku = "111", Category = "3C", Name = "Product 2" }, @@ -416,12 +417,12 @@ public virtual void Insert_MultiPrimaryKey() var spider = new DefaultSpider(); var pipeline = CreatePipeline(); - var metadata = new ModelDefine(); + var metadata = new ModelDefinition(); var resultItems = new ResultItems(); resultItems.Request = new Request(); - resultItems.AddOrUpdateResultItem(metadata.Identity, new Tuple>(metadata, new dynamic[] + resultItems.AddOrUpdateResultItem(metadata.Identity, new Tuple>(metadata, new dynamic[] { new MultiPrimaryKey { Sku = "110", Category = "3C", Name = "Product 1" }, new MultiPrimaryKey { Sku = "111", Category = "3C", Name = "Product 2" }, @@ -459,12 +460,12 @@ public virtual void Insert_InsertNewAndUpdateOld() var spider = new DefaultSpider(); var pipeline = CreatePipeline(); - var metadata = new ModelDefine(); + var metadata = new ModelDefinition(); var resultItems = new ResultItems(); resultItems.Request = new Request(); - resultItems.AddOrUpdateResultItem(metadata.Identity, new Tuple>(metadata, new dynamic[] + resultItems.AddOrUpdateResultItem(metadata.Identity, new Tuple>(metadata, new dynamic[] { new MultiPrimaryKey { Sku = "110", Category = "3C", Name = "Product 1" }, new MultiPrimaryKey { Sku = "111", Category = "3C", Name = "Product 2" }, @@ -477,7 +478,7 @@ public virtual void Insert_InsertNewAndUpdateOld() resultItems = new ResultItems(); resultItems.Request = new Request(); - resultItems.AddOrUpdateResultItem(metadata.Identity, new Tuple>(metadata, new dynamic[] + resultItems.AddOrUpdateResultItem(metadata.Identity, new Tuple>(metadata, new dynamic[] { new AutoIncrementPrimaryKey { Sku = "110", Name="Product 1", Category = "4C" } })); @@ -502,52 +503,52 @@ public virtual void Insert_InsertNewAndUpdateOld() [TableInfo("test", "multiprimarykey", UpdateColumns = new[] { "Category" })] public class MultiPrimaryKey { - [Field(Expression = "category")] + [FieldSelector(Expression = "category")] public string Category { get; set; } - [Field(Expression = "name", IsPrimary = true, Length = 50)] + [FieldSelector(Expression = "name", IsPrimary = true, Length = 50)] public string Name { get; set; } - [Field(Expression = "sku", IsPrimary = true, Length = 50)] + [FieldSelector(Expression = "sku", IsPrimary = true, Length = 50)] public string Sku { get; set; } } [TableInfo("test", "timestamp")] public class Timestamp : BaseEntity { - [Field(Expression = "Category", Type = SelectorType.Enviroment, Length = 100)] + [FieldSelector(Expression = "Category", Type = SelectorType.Enviroment, Length = 100)] public string Category { get; set; } - [Field(Expression = "name")] + [FieldSelector(Expression = "name")] public string Name { get; set; } - [Field(Expression = "sku", Length = 100)] + [FieldSelector(Expression = "sku", Length = 100)] public string Sku { get; set; } } [TableInfo("test", "autoincrementprimarykey", UpdateColumns = new[] { "Category" })] public class AutoIncrementPrimaryKey : BaseEntity { - [Field(Expression = "Category", Type = SelectorType.Enviroment, Length = 100)] + [FieldSelector(Expression = "Category", Type = SelectorType.Enviroment, Length = 100)] public string Category { get; set; } - [Field(Expression = "name")] + [FieldSelector(Expression = "name")] public string Name { get; set; } - [Field(Expression = "sku", Length = 100)] + [FieldSelector(Expression = "sku", Length = 100)] public string Sku { get; set; } } [TableInfo("test", "noneprimarykey")] public class NonePrimaryKey { - [Field(Expression = "Category", Type = SelectorType.Enviroment, Length = 100)] + [FieldSelector(Expression = "Category", Type = SelectorType.Enviroment, Length = 100)] public string Category { get; set; } - [Field(Expression = "name")] + [FieldSelector(Expression = "name")] public string Name { get; set; } - [Field(Expression = "sku", Length = 100)] + [FieldSelector(Expression = "sku", Length = 100)] public string Sku { get; set; } } @@ -565,34 +566,34 @@ public override string ToString() [TableInfo("test", "table15")] private class Entity15 { - [Field(Expression = "Url")] + [FieldSelector(Expression = "Url")] public int Int { get; set; } - [Field(Expression = "Url")] + [FieldSelector(Expression = "Url")] public bool Bool { get; set; } - [Field(Expression = "Url")] + [FieldSelector(Expression = "Url")] public long BigInt { get; set; } - [Field(Expression = "Url")] + [FieldSelector(Expression = "Url")] public string String { get; set; } - [Field(Expression = "Url")] + [FieldSelector(Expression = "Url")] public DateTime Time { get; set; } - [Field(Expression = "Url")] + [FieldSelector(Expression = "Url")] public float Float { get; set; } - [Field(Expression = "Url")] + [FieldSelector(Expression = "Url")] public double Double { get; set; } - [Field(Expression = "Url", Length = 100)] + [FieldSelector(Expression = "Url", Length = 100)] public string String1 { get; set; } - [Field(Expression = "Url", Length = 0)] + [FieldSelector(Expression = "Url", Length = 0)] public string String2 { get; set; } - [Field(Expression = "Url")] + [FieldSelector(Expression = "Url")] public decimal Decimal { get; set; } } } diff --git a/src/DotnetSpider.Extension.Test/Pipeline/PipelineTest.cs b/src/DotnetSpider.Extension.Test/Pipeline/PipelineTest.cs index 1a98794d5..c7c51bd3d 100644 --- a/src/DotnetSpider.Extension.Test/Pipeline/PipelineTest.cs +++ b/src/DotnetSpider.Extension.Test/Pipeline/PipelineTest.cs @@ -12,6 +12,8 @@ using DotnetSpider.Extraction.Model.Formatter; using DotnetSpider.Extraction; using DotnetSpider.Common; +using DotnetSpider.Extraction.Model; +using DotnetSpider.Downloader; namespace DotnetSpider.Extension.Test.Pipeline { @@ -23,7 +25,7 @@ public BaiduSearchSpider() : base("BaiduSearchSpider") { } - protected override void MyInit(params string[] arguments) + protected override void OnInit(params string[] arguments) { Monitor = new LogMonitor(); AddStartUrl("http://news.baidu.com/ns?word=可乐|雪碧&tn=news&from=news&cl=2&pn=0&rn=20&ct=1", new Dictionary { { "Keyword", "可乐|雪碧" } }); @@ -42,7 +44,7 @@ protected override void Handle(Page page) class MyPipeline : BasePipeline { - public override void Process(IEnumerable resultItems, ILogger logger, dynamic sender = null) + public override void Process(IList resultItems, ILogger logger, dynamic sender = null) { } @@ -52,39 +54,39 @@ public override void Process(IEnumerable resultItems, ILogger logge [EntitySelector(Expression = ".//div[@class='result']", Type = SelectorType.XPath)] class BaiduSearchEntry { - [Field(Expression = "Keyword", Type = SelectorType.Enviroment)] + [FieldSelector(Expression = "Keyword", Type = SelectorType.Enviroment)] public string Keyword { get; set; } - [Field(Expression = ".//h3[@class='c-title']/a")] + [FieldSelector(Expression = ".//h3[@class='c-title']/a")] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = "", OldValue = "")] public string Title { get; set; } - [Field(Expression = ".//h3[@class='c-title']/a/@href")] + [FieldSelector(Expression = ".//h3[@class='c-title']/a/@href")] public string Url { get; set; } - [Field(Expression = ".//div/p[@class='c-author']/text()")] + [FieldSelector(Expression = ".//div/p[@class='c-author']/text()")] [ReplaceFormatter(NewValue = "-", OldValue = " ")] public string Website { get; set; } - [Field(Expression = ".//div/span/a[@class='c-cache']/@href")] + [FieldSelector(Expression = ".//div/span/a[@class='c-cache']/@href")] public string Snapshot { get; set; } - [Field(Expression = ".//div[@class='c-summary c-row ']", Option = FieldOptions.InnerText)] + [FieldSelector(Expression = ".//div[@class='c-summary c-row ']", Option = FieldOptions.InnerText)] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = " ", OldValue = " ")] public string Details { get; set; } - [Field(Expression = ".", Option = FieldOptions.InnerText)] + [FieldSelector(Expression = ".", Option = FieldOptions.InnerText)] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = " ", OldValue = " ")] public string PlainText { get; set; } - [Field(Expression = "today", Type = SelectorType.Enviroment)] + [FieldSelector(Expression = "today", Type = SelectorType.Enviroment)] public DateTime run_id { get; set; } } } diff --git a/src/DotnetSpider.Extension.Test/Pipeline/SqlServerEntityPipelineTest.cs b/src/DotnetSpider.Extension.Test/Pipeline/SqlServerEntityPipelineTest.cs index 3988a5bbb..ea02a0908 100644 --- a/src/DotnetSpider.Extension.Test/Pipeline/SqlServerEntityPipelineTest.cs +++ b/src/DotnetSpider.Extension.Test/Pipeline/SqlServerEntityPipelineTest.cs @@ -56,7 +56,7 @@ public override void DataTypes() var pipeline = new SqlServerEntityPipeline("Server=.\\SQLEXPRESS;Database=master;Trusted_Connection=True;MultipleActiveResultSets=true"); var resultItems = new ResultItems(); resultItems.Request = new Request(); - resultItems.AddOrUpdateResultItem(processor.Model.Identity, new Tuple>(processor.Model, new dynamic[] { + resultItems.AddOrUpdateResultItem(processor.Model.Identity, new Tuple>(processor.Model, new dynamic[] { new Dictionary { { "int", "1"}, @@ -199,34 +199,34 @@ public override string ToString() [TableInfo("test", "table15")] private class Entity15 { - [Field(Expression = "Url")] + [FieldSelector(Expression = "Url")] public int Int { get; set; } - [Field(Expression = "Url")] + [FieldSelector(Expression = "Url")] public bool Bool { get; set; } - [Field(Expression = "Url")] + [FieldSelector(Expression = "Url")] public long BigInt { get; set; } - [Field(Expression = "Url")] + [FieldSelector(Expression = "Url")] public string String { get; set; } - [Field(Expression = "Url")] + [FieldSelector(Expression = "Url")] public DateTime Time { get; set; } - [Field(Expression = "Url")] + [FieldSelector(Expression = "Url")] public float Float { get; set; } - [Field(Expression = "Url")] + [FieldSelector(Expression = "Url")] public double Double { get; set; } - [Field(Expression = "Url", Length = 100)] + [FieldSelector(Expression = "Url", Length = 100)] public string String1 { get; set; } - [Field(Expression = "Url", Length = 0)] + [FieldSelector(Expression = "Url", Length = 0)] public string String2 { get; set; } - [Field(Expression = "Url")] + [FieldSelector(Expression = "Url")] public decimal Decimal { get; set; } } } diff --git a/src/DotnetSpider.Extension.Test/Processor/EntityProcessorTest.cs b/src/DotnetSpider.Extension.Test/Processor/EntityProcessorTest.cs index adc917dd6..ed04e0314 100644 --- a/src/DotnetSpider.Extension.Test/Processor/EntityProcessorTest.cs +++ b/src/DotnetSpider.Extension.Test/Processor/EntityProcessorTest.cs @@ -11,35 +11,35 @@ namespace DotnetSpider.Extension.Test.Processor [EntitySelector(Expression = "$.data[*]", Type = SelectorType.JsonPath)] public class Entity1 { - [Field(Expression = "$.age", Type = SelectorType.JsonPath)] + [FieldSelector(Expression = "$.age", Type = SelectorType.JsonPath)] public int Age { get; set; } } [TargetRequestSelector(XPaths = new[] { "//*[@id=\"1111\"]" }, Patterns = new[] { @"&page=[0-9]+&" })] public class Entity14 { - [Field(Expression = "./@data-sku")] + [FieldSelector(Expression = "./@data-sku")] public string Sku { get; set; } } [TargetRequestSelector(XPaths = new[] { "//*[@id=\"1111\"]", "//*[@id=\"2222\"]" }, Patterns = new[] { @"&page=[0-9]+&" })] public class Entity16 { - [Field(Expression = "./@data-sku")] + [FieldSelector(Expression = "./@data-sku")] public string Sku { get; set; } } [TargetRequestSelector(XPaths = new[] { "//*[@id=\"1111\"]" }, Patterns = new[] { @"&page=[0-9]+&", @"&page=[0-1]+&" })] public class Entity17 { - [Field(Expression = "./@data-sku")] + [FieldSelector(Expression = "./@data-sku")] public string Sku { get; set; } } [TargetRequestSelector(XPaths = new[] { "//*[@id=\"1111\"]", "//*[@id=\"2222\"]" }, Patterns = new[] { @"&page=[0-9]+&", @"&page=[0-1]+&" })] public class Entity18 { - [Field(Expression = "./@data-sku")] + [FieldSelector(Expression = "./@data-sku")] public string Sku { get; set; } } @@ -47,7 +47,7 @@ public class Entity18 [TargetRequestSelector(XPaths = new[] { "//*[@id=\"1111\"]" }, Patterns = new[] { @"&page=[0-9]+&" })] public class Entity22 { - [Field(Expression = "./@data-sku")] + [FieldSelector(Expression = "./@data-sku")] public string Sku { get; set; } } @@ -55,7 +55,7 @@ public class Entity22 [TargetRequestSelector(XPaths = new[] { "//*[@id=\"1111\"]" }, Patterns = new[] { @"&page=[0-9]+&" })] public class Entity19 { - [Field(Expression = "./@data-sku")] + [FieldSelector(Expression = "./@data-sku")] public string Sku { get; set; } } @@ -63,7 +63,7 @@ public class Entity19 [TargetRequestSelector(XPaths = new[] { "//*[@id=\"1111\"]" }, Patterns = new[] { @"&page=[0-9]+&" })] public class Entity20 { - [Field(Expression = "./@data-sku")] + [FieldSelector(Expression = "./@data-sku")] public string Sku { get; set; } } @@ -71,14 +71,14 @@ public class Entity20 [TargetRequestSelector(XPaths = new[] { "//*[@id=\"1111\"]" }, Patterns = new[] { @"&page=[0-9]+&" })] public class Entity21 { - [Field(Expression = "./@data-sku")] + [FieldSelector(Expression = "./@data-sku")] public string Sku { get; set; } } [TargetRequestSelector(XPaths = new string[] { null }, Patterns = new string[] { @"&page=[0-9]+&" })] public class Entity25 { - [Field(Expression = "./@data-sku")] + [FieldSelector(Expression = "./@data-sku")] public string Sku { get; set; } } @@ -87,7 +87,7 @@ public class EntityProcessorTest [Fact(DisplayName = "TargetRequestSelector_1Region_1Pattern")] public void TargetRequestSelector_1Region_1Pattern() { - new ModelDefine(); + new ModelDefinition(); var processor = new EntityProcessor(); Assert.Single(processor.GetTargetUrlPatterns("//*[@id=\"1111\"]")); @@ -98,7 +98,7 @@ public void TargetRequestSelector_1Region_1Pattern() [Fact(DisplayName = "TargetRequestSelector_2Region_1Pattern")] public void TargetRequestSelector_2Region_1Pattern() { - new ModelDefine(); + new ModelDefinition(); var processor = new EntityProcessor(); Assert.Single(processor.GetTargetUrlPatterns("//*[@id=\"1111\"]")); @@ -113,7 +113,7 @@ public void TargetRequestSelector_2Region_1Pattern() [Fact(DisplayName = "TargetRequestSelector_1Region_2Pattern")] public void TargetRequestSelector_1Region_2Pattern() { - new ModelDefine(); + new ModelDefinition(); var processor = new EntityProcessor(); Assert.Equal(2, processor.GetTargetUrlPatterns("//*[@id=\"1111\"]").Count); Assert.Equal(@"&page=[0-9]+&", processor.GetTargetUrlPatterns("//*[@id=\"1111\"]")[0].ToString()); @@ -124,7 +124,7 @@ public void TargetRequestSelector_1Region_2Pattern() [Fact(DisplayName = "TargetRequestSelector_2Region_2Pattern")] public void TargetRequestSelector_2Region_2Pattern() { - new ModelDefine(); + new ModelDefinition(); var processor = new EntityProcessor(); Assert.Equal(2, processor.GetTargetUrlPatterns("//*[@id=\"1111\"]").Count); Assert.Equal(@"&page=[0-9]+&", processor.GetTargetUrlPatterns("//*[@id=\"1111\"]")[0].ToString()); @@ -140,7 +140,7 @@ public void TargetRequestSelector_2Region_2Pattern() [Fact(DisplayName = "TargetRequestSelector_Multi_2Region_2Pattern")] public void TargetRequestSelector_Multi_2Region_2Pattern() { - new ModelDefine(); + new ModelDefinition(); var processor = new EntityProcessor(); Assert.Single(processor.GetTargetUrlPatterns("//*[@id=\"1111\"]")); @@ -154,7 +154,7 @@ public void TargetRequestSelector_Multi_2Region_2Pattern() [Fact(DisplayName = "TargetRequestSelector_Multi_2SameRegion_2Pattern")] public void TargetRequestSelector_Multi_2SameRegion_2Pattern() { - new ModelDefine(); + new ModelDefinition(); var processor = new EntityProcessor(); Assert.Equal(2, processor.GetTargetUrlPatterns("//*[@id=\"1111\"]").Count); @@ -167,7 +167,7 @@ public void TargetRequestSelector_Multi_2SameRegion_2Pattern() [Fact(DisplayName = "TargetRequestSelector_Multi_2SameRegion_2SamePattern")] public void TargetRequestSelector_Multi_2SameRegion_2SamePattern() { - new ModelDefine(); + new ModelDefinition(); var processor = new EntityProcessor(); Assert.Single(processor.GetTargetUrlPatterns("//*[@id=\"1111\"]")); @@ -179,7 +179,7 @@ public void TargetRequestSelector_Multi_2SameRegion_2SamePattern() [Fact(DisplayName = "TargetRequestSelector_Multi_2Region_2SamePattern")] public void TargetRequestSelector_Multi_2Region_2SamePattern() { - new ModelDefine(); + new ModelDefinition(); var processor = new EntityProcessor(); Assert.Single(processor.GetTargetUrlPatterns("//*[@id=\"1111\"]")); @@ -194,7 +194,7 @@ public void TargetRequestSelector_Multi_2Region_2SamePattern() [Fact(DisplayName = "TargetRequestSelector_NullRegion_1Pattern")] public void TargetRequestSelector_NullRegion_1Pattern() { - new ModelDefine(); + new ModelDefinition(); var processor = new EntityProcessor(); Assert.Single(processor.GetTargetUrlPatterns(null)); Assert.Equal(@"&page=[0-9]+&", processor.GetTargetUrlPatterns(null)[0].ToString()); diff --git a/src/DotnetSpider.Extension.Test/Processor/ModelProcessorTest.cs b/src/DotnetSpider.Extension.Test/Processor/ModelProcessorTest.cs index 3dab51141..ba1a054a6 100644 --- a/src/DotnetSpider.Extension.Test/Processor/ModelProcessorTest.cs +++ b/src/DotnetSpider.Extension.Test/Processor/ModelProcessorTest.cs @@ -55,28 +55,28 @@ private Page CreatePage() private class N { - [Field(Expression = ".//div[@class='int']")] + [FieldSelector(Expression = ".//div[@class='int']")] public int Int { get; set; } - [Field(Expression = ".//div[@class='bool']")] + [FieldSelector(Expression = ".//div[@class='bool']")] public bool Bool { get; set; } - [Field(Expression = ".//div[@class='bigint']")] + [FieldSelector(Expression = ".//div[@class='bigint']")] public long BigInt { get; set; } - [Field(Expression = ".//div[@class='string']")] + [FieldSelector(Expression = ".//div[@class='string']")] public string String { get; set; } - [Field(Expression = ".//div[@class='datetime']")] + [FieldSelector(Expression = ".//div[@class='datetime']")] public DateTime DateTime { get; set; } - [Field(Expression = ".//div[@class='float']")] + [FieldSelector(Expression = ".//div[@class='float']")] public float Float { get; set; } - [Field(Expression = ".//div[@class='double']")] + [FieldSelector(Expression = ".//div[@class='double']")] public double Double { get; set; } - [Field(Expression = ".//div[@class='decimal']")] + [FieldSelector(Expression = ".//div[@class='decimal']")] public decimal Decimal { get; set; } } } diff --git a/src/DotnetSpider.Extension.Test/Processor/TargetUrlsExtractorTest.cs b/src/DotnetSpider.Extension.Test/Processor/TargetUrlsExtractorTest.cs index 5e5a067ac..2e6190b40 100644 --- a/src/DotnetSpider.Extension.Test/Processor/TargetUrlsExtractorTest.cs +++ b/src/DotnetSpider.Extension.Test/Processor/TargetUrlsExtractorTest.cs @@ -11,6 +11,8 @@ using DotnetSpider.Common; using DotnetSpider.Core.Processor.TargetRequestExtractors; using DotnetSpider.Extension.Processor; +using DotnetSpider.Extraction.Model; +using DotnetSpider.Downloader; namespace DotnetSpider.Extension.Test.Processor { @@ -19,21 +21,21 @@ public class TargetRequestExtractorTest [TargetRequestSelector()] public class Entity15 { - [Field(Expression = "./@data-sku")] + [FieldSelector(Expression = "./@data-sku")] public string Sku { get; set; } } [TargetRequestSelector(XPaths = new[] { "" }, Patterns = new[] { "" })] public class Entity23 { - [Field(Expression = "./@data-sku")] + [FieldSelector(Expression = "./@data-sku")] public string Sku { get; set; } } [TargetRequestSelector(XPaths = new string[] { null }, Patterns = new string[] { null })] public class Entity24 { - [Field(Expression = "./@data-sku")] + [FieldSelector(Expression = "./@data-sku")] public string Sku { get; set; } } @@ -88,6 +90,7 @@ public void AutoIncrementTargetRequestExtractor_Test() { var id = Guid.NewGuid().ToString("N"); AutoIncrementTargetRequestExtractorSpider spider = new AutoIncrementTargetRequestExtractorSpider(id); + //spider.Downloader = new HttpClientDownloader { UseFiddlerProxy = true }; spider.Run(); var pipeline = spider.Pipelines.First() as CollectionEntityPipeline; var entities = pipeline.GetCollection("test.baidu_search"); @@ -108,36 +111,36 @@ private class AutoIncrementTargetRequestExtractorSpider : EntitySpider [EntitySelector(Expression = ".//div[@class='result']", Type = SelectorType.XPath)] private class BaiduSearchEntry { - [Field(Expression = "Keyword", Type = SelectorType.Enviroment, Length = 100)] + [FieldSelector(Expression = "Keyword", Type = SelectorType.Enviroment, Length = 100)] public string Keyword { get; set; } - [Field(Expression = "guid", Type = SelectorType.Enviroment, Length = 100)] + [FieldSelector(Expression = "guid", Type = SelectorType.Enviroment, Length = 100)] public string Guid { get; set; } - [Field(Expression = ".//h3[@class='c-title']/a")] + [FieldSelector(Expression = ".//h3[@class='c-title']/a")] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = "", OldValue = "")] public string Title { get; set; } - [Field(Expression = ".//h3[@class='c-title']/a/@href")] + [FieldSelector(Expression = ".//h3[@class='c-title']/a/@href")] public string Url { get; set; } - [Field(Expression = ".//div/p[@class='c-author']/text()")] + [FieldSelector(Expression = ".//div/p[@class='c-author']/text()")] [ReplaceFormatter(NewValue = "-", OldValue = " ")] public string Website { get; set; } - [Field(Expression = ".//div/span/a[@class='c-cache']/@href")] + [FieldSelector(Expression = ".//div/span/a[@class='c-cache']/@href")] public string Snapshot { get; set; } - [Field(Expression = ".//div[@class='c-summary c-row ']", Option = FieldOptions.InnerText)] + [FieldSelector(Expression = ".//div[@class='c-summary c-row ']", Option = FieldOptions.InnerText)] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = " ", OldValue = " ")] public string Details { get; set; } - [Field(Expression = ".", Option = FieldOptions.InnerText)] + [FieldSelector(Expression = ".", Option = FieldOptions.InnerText)] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = " ", OldValue = " ")] @@ -150,8 +153,16 @@ public AutoIncrementTargetRequestExtractorSpider(string guid) : base("BaiduSearc _guid = guid; } - protected override void MyInit(params string[] arguments) + protected override void OnInit(params string[] arguments) { + Site = new Site + { + Headers = new Dictionary + { + { "Upgrade-Insecure-Requests", "1" } + }, + Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8" + }; Monitor = new LogMonitor(); var word = "可乐|雪碧"; Identity = Guid.NewGuid().ToString(); diff --git a/src/DotnetSpider.Extension.Test/SpiderNameTest.cs b/src/DotnetSpider.Extension.Test/SpiderNameTest.cs index 415cede0f..8262d5567 100644 --- a/src/DotnetSpider.Extension.Test/SpiderNameTest.cs +++ b/src/DotnetSpider.Extension.Test/SpiderNameTest.cs @@ -13,7 +13,7 @@ public MySpider1() { } - protected override void MyInit(params string[] arguments) + protected override void OnInit(params string[] arguments) { } @@ -26,14 +26,14 @@ public MySpider2() } } - public class MySpider3 : CustomizedSpider + public class MySpider3 : Spider { public MySpider3() { Name = "MySpider3_1"; } - protected override void MyInit(params string[] arguments) + protected override void OnInit(params string[] arguments) { } } @@ -45,7 +45,7 @@ public MySpider4() Name = "MySpider4_1"; } - protected override void MyInit(params string[] arguments) + protected override void OnInit(params string[] arguments) { } } diff --git a/src/DotnetSpider.Extension/CustomizedSpider.cs b/src/DotnetSpider.Extension/CustomizedSpider.cs deleted file mode 100644 index b65bb8adf..000000000 --- a/src/DotnetSpider.Extension/CustomizedSpider.cs +++ /dev/null @@ -1,56 +0,0 @@ -using System.Linq; -using DotnetSpider.Common; -using DotnetSpider.Core; -using DotnetSpider.Downloader; - -namespace DotnetSpider.Extension -{ - public abstract class CustomizedSpider : Spider - { - /// - /// 构造方法 - /// - public CustomizedSpider() : this(new Site()) - { - } - - /// - /// 构造方法 - /// - /// 目标站点信息 - public CustomizedSpider(Site site) : base(site) - { - } - - /// - /// 自定义的初始化 - /// - /// 运行参数 - protected abstract void MyInit(params string[] arguments); - - /// - /// 运行爬虫 - /// - /// 运行参数 - protected override void Execute(params string[] arguments) - { - PrintInfo.Print(); - - if (arguments.Any(t => t?.ToLower() == SpiderArguments.Report)) - { - VerifyDataOrGenerateReport(arguments); - } - else - { - Logger.Information("Init custom component..."); - - NetworkCenter.Current.Execute("myInit", () => - { - MyInit(arguments); - }); - - base.Execute(arguments); - } - } - } -} diff --git a/src/DotnetSpider.Extension/DistributedSpider.cs b/src/DotnetSpider.Extension/DistributedSpider.cs index e24f1ff91..cc8a9ed96 100644 --- a/src/DotnetSpider.Extension/DistributedSpider.cs +++ b/src/DotnetSpider.Extension/DistributedSpider.cs @@ -10,7 +10,7 @@ namespace DotnetSpider.Extension /// /// 分布式爬虫 /// - public abstract class DistributedSpider : CustomizedSpider + public abstract class DistributedSpider : Spider { /// /// 验证结果保存到Redis中的Key @@ -41,8 +41,6 @@ public DistributedSpider(Site site) : base(site) /// 运行参数 protected override void Execute(params string[] arguments) { - PrintInfo.Print(); - RegisterControl(this); base.Execute(arguments); diff --git a/src/DotnetSpider.Extension/DotnetSpider.Extension.csproj b/src/DotnetSpider.Extension/DotnetSpider.Extension.csproj index 74193edc8..772a645af 100644 --- a/src/DotnetSpider.Extension/DotnetSpider.Extension.csproj +++ b/src/DotnetSpider.Extension/DotnetSpider.Extension.csproj @@ -1,10 +1,9 @@  - net40;net45;netstandard2.0 true true - 3.0.0-beta1 + 3.0.0 zlzforever@163.com; DotnetSpider.Extension Copyright 2018 Lewis Zou diff --git a/src/DotnetSpider.Extension/Downloader/DownloadCache.cs b/src/DotnetSpider.Extension/Downloader/DownloadCache.cs index 49e132baf..15ce78515 100644 --- a/src/DotnetSpider.Extension/Downloader/DownloadCache.cs +++ b/src/DotnetSpider.Extension/Downloader/DownloadCache.cs @@ -1,4 +1,5 @@ using DotnetSpider.Extraction; +using DotnetSpider.Extraction.Model; using DotnetSpider.Extraction.Model.Attribute; namespace DotnetSpider.Extension.Downloader @@ -12,31 +13,31 @@ public class DownloadCache /// /// 所属爬虫的唯一标识 /// - [Field(Expression = "", Type = SelectorType.Enviroment, Length = 120)] + [FieldSelector(Expression = "", Type = SelectorType.Enviroment, Length = 120)] public string Identity { get; set; } /// /// 所属爬虫的任务编号 /// - [Field(Expression = "", Type = SelectorType.Enviroment, Length = 120)] + [FieldSelector(Expression = "", Type = SelectorType.Enviroment, Length = 120)] public string TaskId { get; set; } /// /// 所属爬虫的名称 /// - [Field(Expression = "", Type = SelectorType.Enviroment, Length = 120)] + [FieldSelector(Expression = "", Type = SelectorType.Enviroment, Length = 120)] public string Name { get; set; } /// /// 采集的链接 /// - [Field(Expression = "", Type = SelectorType.Enviroment, Length = 255)] + [FieldSelector(Expression = "", Type = SelectorType.Enviroment, Length = 255)] public string Url { get; set; } /// /// 下载的内容 /// - [Field(Expression = "", Type = SelectorType.Enviroment)] + [FieldSelector(Expression = "", Type = SelectorType.Enviroment)] public string Content { get; set; } } } diff --git a/src/DotnetSpider.Extension/Downloader/ManualWebDriverCookieInjector.cs b/src/DotnetSpider.Extension/Downloader/ManualWebDriverCookieInjector.cs index c37e2d5ad..62f7549c4 100644 --- a/src/DotnetSpider.Extension/Downloader/ManualWebDriverCookieInjector.cs +++ b/src/DotnetSpider.Extension/Downloader/ManualWebDriverCookieInjector.cs @@ -18,6 +18,7 @@ public class ManualWebDriverCookieInjector : WebDriverCookieInjector /// 构造方法 /// /// 登陆的链接 + /// 浏览器 /// 可控制程序 public ManualWebDriverCookieInjector(string url, Browser browser, IControllable controllable) : base(browser, controllable) { diff --git a/src/DotnetSpider.Extension/Downloader/WebDriverActions/Scroll.cs b/src/DotnetSpider.Extension/Downloader/WebDriverActions/Scroll.cs index e98aed742..1ba710618 100644 --- a/src/DotnetSpider.Extension/Downloader/WebDriverActions/Scroll.cs +++ b/src/DotnetSpider.Extension/Downloader/WebDriverActions/Scroll.cs @@ -1,8 +1,8 @@ -using OpenQA.Selenium.Remote; -using System; +using System; using System.Threading; +using OpenQA.Selenium.Remote; -namespace DotnetSpider.Extension.Downloader +namespace DotnetSpider.Extension.Downloader.WebDriverActions { /// /// 滚动操作的实现 diff --git a/src/DotnetSpider.Extension/Downloader/WebDriverDownloader.cs b/src/DotnetSpider.Extension/Downloader/WebDriverDownloader.cs index 3680b1f01..d0791e917 100644 --- a/src/DotnetSpider.Extension/Downloader/WebDriverDownloader.cs +++ b/src/DotnetSpider.Extension/Downloader/WebDriverDownloader.cs @@ -19,7 +19,6 @@ namespace DotnetSpider.Extension.Downloader /// public class WebDriverDownloader : BaseDownloader, IBeforeDownloadHandler { - private readonly object _locker = new object(); private IWebDriver _driver; private readonly int _driverWaitTime; private readonly Browser _browser; @@ -98,7 +97,7 @@ public override void AddCookie(System.Net.Cookie cookie) { base.AddCookie(cookie); // 如果 Downloader 在运行中, 需要把 Cookie 加到 Driver 中 - _driver?.Manage().Cookies.AddCookie(new OpenQA.Selenium.Cookie(cookie.Name, cookie.Value, cookie.Domain, cookie.Path, null)); + _driver?.Manage().Cookies.AddCookie(new Cookie(cookie.Name, cookie.Value, cookie.Domain, cookie.Path, null)); } public void Handle(ref Request request, IDownloader downloader) @@ -115,7 +114,7 @@ public void Handle(ref Request request, IDownloader downloader) foreach (System.Net.Cookie cookie in cookies) { // 此处不能通过直接调用AddCookie来添加, 会导致CookieContainer添加重复值 - _driver.Manage().Cookies.AddCookie(new OpenQA.Selenium.Cookie(cookie.Name, cookie.Value, cookie.Domain, cookie.Path, null)); + _driver.Manage().Cookies.AddCookie(new Cookie(cookie.Name, cookie.Value, cookie.Domain, cookie.Path, null)); } } } @@ -125,7 +124,6 @@ public void Handle(ref Request request, IDownloader downloader) [MethodImpl(MethodImplOptions.Synchronized)] protected override Common.Response DowloadContent(Request request) { - Site site = request.Site; try { NetworkCenter.Current.Execute("webdriver-download", () => diff --git a/src/DotnetSpider.Extension/Pipeline/CollectionEntityPipeline.cs b/src/DotnetSpider.Extension/Pipeline/CollectionEntityPipeline.cs index d86ee6df5..1472d2332 100644 --- a/src/DotnetSpider.Extension/Pipeline/CollectionEntityPipeline.cs +++ b/src/DotnetSpider.Extension/Pipeline/CollectionEntityPipeline.cs @@ -18,7 +18,7 @@ public class CollectionEntityPipeline : ModelPipeline, ICollectionEntityPipeline /// /// 爬虫实体名称 /// 实体数据 - public IEnumerable GetCollection(string modeIdentity) + public IList GetCollection(string modeIdentity) { lock (_locker) { @@ -39,9 +39,9 @@ public IEnumerable GetCollection(string modeIdentity) /// 日志接口 /// 调用方 /// 最终影响结果数量(如数据库影响行数) - protected override int Process(IModel model, IEnumerable datas, ILogger logger, dynamic sender) + protected override int Process(IModel model, IList datas, ILogger logger, dynamic sender = null) { - if (datas == null || datas.Count() == 0) + if (datas == null|| datas.Count == 0) { return 0; } @@ -60,7 +60,7 @@ protected override int Process(IModel model, IEnumerable datas, ILogger _collector.Add(model.Identity, list); } - return datas.Count(); + return datas.Count; } } } diff --git a/src/DotnetSpider.Extension/Pipeline/ConsoleEntityPipeline.cs b/src/DotnetSpider.Extension/Pipeline/ConsoleEntityPipeline.cs index 3c19d364d..3fa2a115f 100644 --- a/src/DotnetSpider.Extension/Pipeline/ConsoleEntityPipeline.cs +++ b/src/DotnetSpider.Extension/Pipeline/ConsoleEntityPipeline.cs @@ -21,9 +21,9 @@ public class ConsoleEntityPipeline : ModelPipeline /// 日志接口 /// 调用方 /// 最终影响结果数量(如数据库影响行数) - protected override int Process(IModel model, IEnumerable datas, ILogger logger, dynamic sender) + protected override int Process(IModel model, IList datas, ILogger logger, dynamic sender = null) { - if (datas == null || datas.Count() == 0) + if (datas == null || datas.Count == 0) { return 0; } @@ -32,7 +32,7 @@ protected override int Process(IModel model, IEnumerable datas, ILogger { Console.WriteLine($"Store: {JsonConvert.SerializeObject(data)}"); } - return datas.Count(); + return datas.Count; } } } diff --git a/src/DotnetSpider.Extension/Pipeline/DbModelPipeline.cs b/src/DotnetSpider.Extension/Pipeline/DbModelPipeline.cs index 56372820c..6e5098056 100644 --- a/src/DotnetSpider.Extension/Pipeline/DbModelPipeline.cs +++ b/src/DotnetSpider.Extension/Pipeline/DbModelPipeline.cs @@ -9,7 +9,6 @@ using System; using System.Linq; using DotnetSpider.Extraction.Model; -using DotnetSpider.Extraction.Model.Attribute; using DotnetSpider.Common; namespace DotnetSpider.Extension.Pipeline @@ -52,14 +51,14 @@ protected DbModelPipeline(string connectString = null, PipelineMode pipelineMode protected abstract void InitDatabaseAndTable(IDbConnection conn, IModel model); - protected override int Process(IModel model, IEnumerable datas, ILogger logger, dynamic sender) + protected override int Process(IModel model, IList datas, ILogger logger, dynamic sender = null) { - if (datas == null || datas.Count() == 0) + if (datas == null || datas.Count == 0) { return 0; } - if (model.TableInfo == null) + if (model.Table == null) { return 0; } @@ -73,7 +72,7 @@ protected override int Process(IModel model, IEnumerable datas, ILogger conn = RefreshConnectionString(logger); // 每天执行一次建表操作, 可以实现每天一个表的操作,或者按周分表可以在运行时创建新表。 - var key = model.TableInfo.Postfix != TableNamePostfix.None ? $"{model.Identity}_{DateTime.Now:yyyyMMdd}" : model.Identity; + var key = model.Table.Postfix != TableNamePostfix.None ? $"{model.Identity}_{DateTime.Now:yyyyMMdd}" : model.Identity; Sqls sqls; lock (this) { diff --git a/src/DotnetSpider.Extension/Pipeline/DefaultMySqlPipeline.cs b/src/DotnetSpider.Extension/Pipeline/DefaultMySqlPipeline.cs index 59512bad2..7a3787a2a 100644 --- a/src/DotnetSpider.Extension/Pipeline/DefaultMySqlPipeline.cs +++ b/src/DotnetSpider.Extension/Pipeline/DefaultMySqlPipeline.cs @@ -59,8 +59,9 @@ public DefaultMySqlPipeline(string connectString, string database, string tableN /// 处理页面解析器解析到的数据结果 /// /// 数据结果 - /// 调用方 - public override void Process(IEnumerable resultItems, ILogger logger, dynamic spider) + /// 日志接口 + /// 调用方 + public override void Process(IList resultItems, ILogger logger, dynamic sender = null) { var results = new List(); foreach (var resultItem in resultItems) diff --git a/src/DotnetSpider.Extension/Pipeline/ExcelEntityPipeline.cs b/src/DotnetSpider.Extension/Pipeline/ExcelEntityPipeline.cs index 9efcf49ac..71247336f 100644 --- a/src/DotnetSpider.Extension/Pipeline/ExcelEntityPipeline.cs +++ b/src/DotnetSpider.Extension/Pipeline/ExcelEntityPipeline.cs @@ -52,10 +52,10 @@ public override void Dispose() } } - private void WriteToExcel(IModel model, IEnumerable datas, ILogger logger, dynamic sender) + private void WriteToExcel(IModel model, IList datas, dynamic sender) { var excelPath = Path.Combine(Env.BaseDirectory, "excels", $"{sender.Name}_{sender.Identity}.xlsx"); - var sheetName = model.TableInfo.Name; + var sheetName = model.Table.Name; var sheetIndex = $"{excelPath}.{sheetName}"; if (!_packages.ContainsKey(excelPath)) @@ -115,15 +115,15 @@ private int IncreaseRowIndex(string sheet) /// 调用方 /// 最终影响结果数量(如数据库影响行数) [MethodImpl(MethodImplOptions.Synchronized)] - protected override int Process(IModel model, IEnumerable datas, ILogger logger, dynamic sender) + protected override int Process(IModel model, IList datas, ILogger logger, dynamic sender = null) { - if (datas == null || datas.Count() == 0) + if (datas == null || datas.Count == 0) { return 0; } - WriteToExcel(model, datas, logger, sender); - return datas.Count(); + WriteToExcel(model, datas, sender); + return datas.Count; } } } \ No newline at end of file diff --git a/src/DotnetSpider.Extension/Pipeline/ICollectionEntityPipeline.cs b/src/DotnetSpider.Extension/Pipeline/ICollectionEntityPipeline.cs index a7372c81b..db748bf01 100644 --- a/src/DotnetSpider.Extension/Pipeline/ICollectionEntityPipeline.cs +++ b/src/DotnetSpider.Extension/Pipeline/ICollectionEntityPipeline.cs @@ -12,6 +12,6 @@ public interface ICollectionEntityPipeline /// /// 实体名称 /// 数据结果 - IEnumerable GetCollection(string entityName); + IList GetCollection(string entityName); } } diff --git a/src/DotnetSpider.Extension/Pipeline/JsonFileEntityPipeline.cs b/src/DotnetSpider.Extension/Pipeline/JsonFileEntityPipeline.cs index eab74dcaf..1537985bc 100644 --- a/src/DotnetSpider.Extension/Pipeline/JsonFileEntityPipeline.cs +++ b/src/DotnetSpider.Extension/Pipeline/JsonFileEntityPipeline.cs @@ -3,7 +3,6 @@ using System.Runtime.CompilerServices; using System.Text; using DotnetSpider.Core; -using System.Linq; using DotnetSpider.Extraction.Model; using DotnetSpider.Common; @@ -37,16 +36,17 @@ public override void Dispose() /// 调用方 /// 最终影响结果数量(如数据库影响行数) [MethodImpl(MethodImplOptions.Synchronized)] - protected override int Process(IModel model, IEnumerable datas, ILogger logger, dynamic sender) + protected override int Process(IModel model, IList datas, ILogger logger, dynamic sender = null) { - if (datas == null || datas.Count() == 0) + if (datas == null || datas.Count == 0) { return 0; } StreamWriter writer; - var dataFolder = Path.Combine(Env.BaseDirectory, "json", sender.Identity); - var jsonFile = Path.Combine(dataFolder, $"{model.TableInfo.FullName}.json"); + var identity = GetIdentity(sender); + var dataFolder = Path.Combine(Env.BaseDirectory, "json", identity); + var jsonFile = Path.Combine(dataFolder, $"{model.Table.FullName}.json"); if (_writers.ContainsKey(jsonFile)) { writer = _writers[jsonFile]; @@ -65,7 +65,7 @@ protected override int Process(IModel model, IEnumerable datas, ILogger { writer.WriteLine(entry.ToString()); } - return datas.Count(); + return datas.Count; } } } diff --git a/src/DotnetSpider.Extension/Pipeline/ModelPipeline.cs b/src/DotnetSpider.Extension/Pipeline/ModelPipeline.cs index c88995013..257ec7ec6 100644 --- a/src/DotnetSpider.Extension/Pipeline/ModelPipeline.cs +++ b/src/DotnetSpider.Extension/Pipeline/ModelPipeline.cs @@ -21,7 +21,7 @@ public abstract class ModelPipeline : BasePipeline /// 日志接口 /// 调用方 /// 最终影响结果数量(如数据库影响行数) - protected abstract int Process(IModel model, IEnumerable datas, ILogger logger, dynamic sender = null); + protected abstract int Process(IModel model, IList datas, ILogger logger, dynamic sender = null); /// /// 处理页面解析器解析到的数据结果 @@ -29,7 +29,7 @@ public abstract class ModelPipeline : BasePipeline /// 数据结果 /// 日志接口 /// 调用方 - public override void Process(IEnumerable resultItems, ILogger logger, dynamic sender = null) + public override void Process(IList resultItems, ILogger logger, dynamic sender = null) { if (resultItems == null) { @@ -40,11 +40,11 @@ public override void Process(IEnumerable resultItems, ILogger logge { foreach (var kv in resultItem.Results) { - var value = kv.Value as Tuple>; + var value = kv.Value as Tuple>; if (value?.Item2 != null && value.Item2.Any()) { - resultItem.Request.AddCountOfResults(value.Item2.Count()); + resultItem.Request.AddCountOfResults(value.Item2.Count); int effectedRows = Process(value.Item1, value.Item2, logger, sender); resultItem.Request.AddEffectedRows(effectedRows); } diff --git a/src/DotnetSpider.Extension/Pipeline/MongoDbEntityPipeline.cs b/src/DotnetSpider.Extension/Pipeline/MongoDbEntityPipeline.cs index 0300730d9..ee58908a3 100644 --- a/src/DotnetSpider.Extension/Pipeline/MongoDbEntityPipeline.cs +++ b/src/DotnetSpider.Extension/Pipeline/MongoDbEntityPipeline.cs @@ -35,15 +35,15 @@ public MongoDbEntityPipeline(string connectString) /// 日志接口 /// 调用方 /// 最终影响结果数量(如数据库影响行数) - protected override int Process(IModel model, IEnumerable datas, ILogger logger, dynamic sender) + protected override int Process(IModel model, IList datas, ILogger logger, dynamic sender = null) { - if (datas == null || datas.Count() == 0) + if (datas == null || datas.Count == 0) { return 0; } - var db = _client.GetDatabase(model.TableInfo.Database); - var collection = db.GetCollection(model.TableInfo.FullName); + var db = _client.GetDatabase(model.Table.Database); + var collection = db.GetCollection(model.Table.FullName); var action = new Action(() => { @@ -64,7 +64,7 @@ protected override int Process(IModel model, IEnumerable datas, ILogger { action(); } - return datas.Count(); + return datas.Count; } } } diff --git a/src/DotnetSpider.Extension/Pipeline/MySqlEntityFilePipeline.cs b/src/DotnetSpider.Extension/Pipeline/MySqlEntityFilePipeline.cs index d4e95804c..69377db1e 100644 --- a/src/DotnetSpider.Extension/Pipeline/MySqlEntityFilePipeline.cs +++ b/src/DotnetSpider.Extension/Pipeline/MySqlEntityFilePipeline.cs @@ -56,17 +56,18 @@ public MySqlEntityFilePipeline(FileType fileType = FileType.LoadFile) /// 调用方 /// 最终影响结果数量(如数据库影响行数) [MethodImpl(MethodImplOptions.Synchronized)] - protected override int Process(IModel model, IEnumerable datas, ILogger logger, dynamic sender) + protected override int Process(IModel model, IList datas, ILogger logger, dynamic sender = null) { - if (datas == null || datas.Count() == 0) + if (datas == null || datas.Count == 0) { return 0; } StreamWriter writer; - var tableName = model.TableInfo.FullName; - var dataFolder = Path.Combine(Env.BaseDirectory, "mysql", sender.Identity); - var mysqlFile = Path.Combine(dataFolder, $"{model.TableInfo.Database}.{tableName}.sql"); + var tableName = model.Table.FullName; + var identity = GetIdentity(sender); + var dataFolder = Path.Combine(Env.BaseDirectory, "mysql", identity); + var mysqlFile = Path.Combine(dataFolder, $"{model.Table.Database}.{tableName}.sql"); if (_writers.ContainsKey(mysqlFile)) { writer = _writers[mysqlFile]; @@ -96,7 +97,7 @@ protected override int Process(IModel model, IEnumerable datas, ILogger } } - return datas.Count(); + return datas.Count; } /// @@ -118,7 +119,7 @@ private void AppendInsertSqlFile(StreamWriter writer, IModel model, IEnumerable< foreach (var item in items) { //{Environment.NewLine} - builder.Append($"INSERT IGNORE INTO `{model.TableInfo.Database}`.`{model.TableInfo.FullName}` ("); + builder.Append($"INSERT IGNORE INTO `{model.Table.Database}`.`{model.Table.FullName}` ("); var lastColumn = model.Fields.Last(); foreach (var column in model.Fields) { diff --git a/src/DotnetSpider.Extension/Pipeline/MySqlEntityPipeline.cs b/src/DotnetSpider.Extension/Pipeline/MySqlEntityPipeline.cs index ea31a9ca4..a1352504c 100644 --- a/src/DotnetSpider.Extension/Pipeline/MySqlEntityPipeline.cs +++ b/src/DotnetSpider.Extension/Pipeline/MySqlEntityPipeline.cs @@ -43,7 +43,7 @@ protected override Sqls GenerateSqls(IModel model, ILogger logger) protected override void InitDatabaseAndTable(IDbConnection conn, IModel model) { - var database = IgnoreColumnCase ? model.TableInfo.Database.ToLower() : model.TableInfo.Database; + var database = IgnoreColumnCase ? model.Table.Database.ToLower() : model.Table.Database; conn.MyExecute($"CREATE SCHEMA IF NOT EXISTS `{database}` DEFAULT CHARACTER SET utf8mb4;"); conn.MyExecute(GenerateCreateTableSql(model)); } @@ -55,8 +55,8 @@ protected override void InitDatabaseAndTable(IDbConnection conn, IModel model) /// SQL语句 private string GenerateCreateTableSql(IModel model) { - var tableName = IgnoreColumnCase ? model.TableInfo.FullName.ToLower() : model.TableInfo.FullName; - var database = IgnoreColumnCase ? model.TableInfo.Database.ToLower() : model.TableInfo.Database; + var tableName = IgnoreColumnCase ? model.Table.FullName.ToLower() : model.Table.FullName; + var database = IgnoreColumnCase ? model.Table.Database.ToLower() : model.Table.Database; var fields = model.Fields; @@ -89,9 +89,9 @@ private string GenerateCreateTableSql(IModel model) builder.Append($", PRIMARY KEY ({string.Join(", ", fields.Where(f => f.IsPrimary).Select(field => IgnoreColumnCase ? field.Name.ToLower() : field.Name))})"); } - if (model.TableInfo.Indexs != null) + if (model.Table.Indexs != null) { - foreach (var index in model.TableInfo.Indexs) + foreach (var index in model.Table.Indexs) { var columns = index.Split(','); string name = string.Join("_", columns.Select(c => c)); @@ -99,9 +99,9 @@ private string GenerateCreateTableSql(IModel model) builder.Append($", KEY `index_{name}` ({indexColumNames})"); } } - if (model.TableInfo.Uniques != null) + if (model.Table.Uniques != null) { - foreach (var unique in model.TableInfo.Uniques) + foreach (var unique in model.Table.Uniques) { var columns = unique.Split(','); string name = string.Join("_", columns.Select(c => c)); @@ -135,8 +135,8 @@ private string GenerateInsertSql(IModel model, bool ignoreDuplicate) columnsParamsSql = $"{columnsParamsSql}, NOW(), CURRENT_DATE()"; } - var tableName = IgnoreColumnCase ? model.TableInfo.FullName.ToLower() : model.TableInfo.FullName; - var database = IgnoreColumnCase ? model.TableInfo.Database.ToLower() : model.TableInfo.Database; + var tableName = IgnoreColumnCase ? model.Table.FullName.ToLower() : model.Table.FullName; + var database = IgnoreColumnCase ? model.Table.Database.ToLower() : model.Table.Database; var sql = $"INSERT {(ignoreDuplicate ? "IGNORE" : "")} INTO `{database}`.`{tableName}` ({columnsSql}) VALUES ({columnsParamsSql});"; return sql; @@ -162,12 +162,12 @@ private string GenerateInsertNewAndUpdateOldSql(IModel model) columnsParamsSql = $"{columnsParamsSql}, NOW(), CURRENT_DATE()"; } - var tableName = IgnoreColumnCase ? model.TableInfo.FullName.ToLower() : model.TableInfo.FullName; + var tableName = IgnoreColumnCase ? model.Table.FullName.ToLower() : model.Table.FullName; string setParams = string.Join(", ", insertColumns.Select(p => $"`{(IgnoreColumnCase ? p.Name.ToLower() : p.Name)}`=@{p.Name}")); var sql = - $"INSERT INTO `{model.TableInfo.Database}`.`{tableName}` ({columnsSql}) VALUES ({columnsParamsSql}) ON DUPLICATE KEY UPDATE {setParams};"; + $"INSERT INTO `{model.Table.Database}`.`{tableName}` ({columnsSql}) VALUES ({columnsParamsSql}) ON DUPLICATE KEY UPDATE {setParams};"; return sql; } @@ -175,9 +175,9 @@ private string GenerateInsertNewAndUpdateOldSql(IModel model) private string GenerateUpdateSql(IModel model, ILogger logger) { // 无主键, 无更新字段都无法生成更新SQL - if (model.TableInfo.UpdateColumns == null || !model.TableInfo.UpdateColumns.Any() || !model.Fields.Any(f => f.IsPrimary)) + if (model.Table.UpdateColumns == null || !model.Table.UpdateColumns.Any() || !model.Fields.Any(f => f.IsPrimary)) { - if (model.TableInfo.UpdateColumns == null || !model.TableInfo.UpdateColumns.Any()) + if (model.Table.UpdateColumns == null || !model.Table.UpdateColumns.Any()) { logger.Warning("Can't generate update sql, in table info, the count of update columns is zero."); } @@ -188,8 +188,8 @@ private string GenerateUpdateSql(IModel model, ILogger logger) return null; } - var tableName = IgnoreColumnCase ? model.TableInfo.FullName.ToLower() : model.TableInfo.FullName; - var database = IgnoreColumnCase ? model.TableInfo.Database.ToLower() : model.TableInfo.Database; + var tableName = IgnoreColumnCase ? model.Table.FullName.ToLower() : model.Table.FullName; + var database = IgnoreColumnCase ? model.Table.Database.ToLower() : model.Table.Database; string where = ""; foreach (var field in model.Fields.Where(f => f.IsPrimary)) @@ -199,7 +199,7 @@ private string GenerateUpdateSql(IModel model, ILogger logger) } where = where.Substring(0, where.Length - 3); - string setCols = string.Join(", ", model.TableInfo.UpdateColumns.Select(p => $"`{p.ToLower()}`=@{p}")); + string setCols = string.Join(", ", model.Table.UpdateColumns.Select(p => $"`{p.ToLower()}`=@{p}")); var sql = $"UPDATE `{database}`.`{tableName}` SET {setCols} WHERE {where};"; return sql; } @@ -211,8 +211,8 @@ private string GenerateSelectSql(IModel model) return null; } - var tableName = IgnoreColumnCase ? model.TableInfo.FullName.ToLower() : model.TableInfo.FullName; - var database = IgnoreColumnCase ? model.TableInfo.Database.ToLower() : model.TableInfo.Database; + var tableName = IgnoreColumnCase ? model.Table.FullName.ToLower() : model.Table.FullName; + var database = IgnoreColumnCase ? model.Table.Database.ToLower() : model.Table.Database; string where = ""; foreach (var field in model.Fields.Where(f => f.IsPrimary)) @@ -225,7 +225,7 @@ private string GenerateSelectSql(IModel model) return sql; } - private string GenerateColumn(Field field) + private string GenerateColumn(FieldSelector field) { var columnName = IgnoreColumnCase ? field.Name.ToLower() : field.Name; var dataType = GetDataTypeSql(field.DataType, field.Length); diff --git a/src/DotnetSpider.Extension/Pipeline/SqlServerEntityPipeline.cs b/src/DotnetSpider.Extension/Pipeline/SqlServerEntityPipeline.cs index abf360ef4..f1ececfc2 100644 --- a/src/DotnetSpider.Extension/Pipeline/SqlServerEntityPipeline.cs +++ b/src/DotnetSpider.Extension/Pipeline/SqlServerEntityPipeline.cs @@ -32,11 +32,11 @@ private string GenerateCreateDatabaseSql(IModel model, string serverVersion) { case "11": { - return $"USE master; IF NOT EXISTS(SELECT * FROM sysdatabases WHERE name='{model.TableInfo.Database}') CREATE DATABASE {model.TableInfo.Database};"; + return $"USE master; IF NOT EXISTS(SELECT * FROM sysdatabases WHERE name='{model.Table.Database}') CREATE DATABASE {model.Table.Database};"; } default: { - return $"USE master; IF NOT EXISTS(SELECT * FROM sys.databases WHERE name='{model.TableInfo.Database}') CREATE DATABASE {model.TableInfo.Database};"; + return $"USE master; IF NOT EXISTS(SELECT * FROM sys.databases WHERE name='{model.Table.Database}') CREATE DATABASE {model.Table.Database};"; } } } @@ -48,19 +48,19 @@ private string GenerateIfDatabaseExistsSql(IModel model, string serverVersion) { case "11": { - return $"SELECT COUNT(*) FROM sysdatabases WHERE name='{model.TableInfo.Database}'"; + return $"SELECT COUNT(*) FROM sysdatabases WHERE name='{model.Table.Database}'"; } default: { - return $"SELECT COUNT(*) FROM sys.databases WHERE name='{model.TableInfo.Database}'"; + return $"SELECT COUNT(*) FROM sys.databases WHERE name='{model.Table.Database}'"; } } } private string GenerateCreateTableSql(IModel model) { - var tableName = IgnoreColumnCase ? model.TableInfo.FullName.ToLower() : model.TableInfo.FullName; - var database = IgnoreColumnCase ? model.TableInfo.Database.ToLower() : model.TableInfo.Database; + var tableName = IgnoreColumnCase ? model.Table.FullName.ToLower() : model.Table.FullName; + var database = IgnoreColumnCase ? model.Table.Database.ToLower() : model.Table.Database; var fields = model.Fields; var singleAutoIncrementPrimary = fields.Count(f => f.IsPrimary && (f.DataType == DataType.Int || f.DataType == DataType.Long)) == 1; @@ -98,9 +98,9 @@ private string GenerateCreateTableSql(IModel model) builder.Append(") ON [PRIMARY];"); } - if (model.TableInfo.Indexs != null) + if (model.Table.Indexs != null) { - foreach (var index in model.TableInfo.Indexs) + foreach (var index in model.Table.Indexs) { var columns = index.Split(','); string name = string.Join("_", columns.Select(c => c)); @@ -109,9 +109,9 @@ private string GenerateCreateTableSql(IModel model) } } - if (model.TableInfo.Uniques != null) + if (model.Table.Uniques != null) { - foreach (var unique in model.TableInfo.Uniques) + foreach (var unique in model.Table.Uniques) { var columns = unique.Split(','); string name = string.Join("_", columns.Select(c => c)); @@ -123,7 +123,7 @@ private string GenerateCreateTableSql(IModel model) return sql; } - private string GenerateColumn(Field field) + private string GenerateColumn(FieldSelector field) { var columnName = IgnoreColumnCase ? field.Name.ToLower() : field.Name; var dataType = GetDataTypeSql(field.DataType, field.Length); @@ -158,8 +158,8 @@ private string GenerateInsertSql(IModel model) columnsParamsSql = $"{columnsParamsSql}, GETDATE(), GETDATE()"; } - var tableName = IgnoreColumnCase ? model.TableInfo.FullName.ToLower() : model.TableInfo.FullName; - var database = IgnoreColumnCase ? model.TableInfo.Database.ToLower() : model.TableInfo.Database; + var tableName = IgnoreColumnCase ? model.Table.FullName.ToLower() : model.Table.FullName; + var database = IgnoreColumnCase ? model.Table.Database.ToLower() : model.Table.Database; var sql = $"USE {database}; INSERT INTO [{tableName}] ({columnsSql}) VALUES ({columnsParamsSql});"; return sql; @@ -168,9 +168,9 @@ private string GenerateInsertSql(IModel model) private string GenerateUpdateSql(IModel model, ILogger logger) { // 无主键, 无更新字段都无法生成更新SQL - if (model.TableInfo.UpdateColumns == null || !model.TableInfo.UpdateColumns.Any() || !model.Fields.Any(f => f.IsPrimary)) + if (model.Table.UpdateColumns == null || !model.Table.UpdateColumns.Any() || !model.Fields.Any(f => f.IsPrimary)) { - if (model.TableInfo.UpdateColumns == null || !model.TableInfo.UpdateColumns.Any()) + if (model.Table.UpdateColumns == null || !model.Table.UpdateColumns.Any()) { logger.Warning("Can't generate update sql, in table info, the count of update columns is zero."); } @@ -181,8 +181,8 @@ private string GenerateUpdateSql(IModel model, ILogger logger) return null; } - var tableName = IgnoreColumnCase ? model.TableInfo.FullName.ToLower() : model.TableInfo.FullName; - var database = IgnoreColumnCase ? model.TableInfo.Database.ToLower() : model.TableInfo.Database; + var tableName = IgnoreColumnCase ? model.Table.FullName.ToLower() : model.Table.FullName; + var database = IgnoreColumnCase ? model.Table.Database.ToLower() : model.Table.Database; string where = ""; foreach (var field in model.Fields.Where(f => f.IsPrimary)) @@ -192,7 +192,7 @@ private string GenerateUpdateSql(IModel model, ILogger logger) } where = where.Substring(0, where.Length - 3); - string setCols = string.Join(", ", model.TableInfo.UpdateColumns.Select(p => $"[{p.ToLower()}]=@{p}")); + string setCols = string.Join(", ", model.Table.UpdateColumns.Select(p => $"[{p.ToLower()}]=@{p}")); var sql = $"USE [{database}]; UPDATE [{tableName}] SET {setCols} WHERE {where};"; return sql; } @@ -204,8 +204,8 @@ private string GenerateSelectSql(IModel model) return null; } - var tableName = IgnoreColumnCase ? model.TableInfo.FullName.ToLower() : model.TableInfo.FullName; - var database = IgnoreColumnCase ? model.TableInfo.Database.ToLower() : model.TableInfo.Database; + var tableName = IgnoreColumnCase ? model.Table.FullName.ToLower() : model.Table.FullName; + var database = IgnoreColumnCase ? model.Table.Database.ToLower() : model.Table.Database; string where = ""; foreach (var field in model.Fields.Where(f => f.IsPrimary)) diff --git a/src/DotnetSpider.Extension/Processor/EntityProcessor.cs b/src/DotnetSpider.Extension/Processor/EntityProcessor.cs index 9839bfff9..e21492340 100644 --- a/src/DotnetSpider.Extension/Processor/EntityProcessor.cs +++ b/src/DotnetSpider.Extension/Processor/EntityProcessor.cs @@ -16,7 +16,7 @@ namespace DotnetSpider.Extension.Processor /// 目标链接的解析、筛选器 /// 对解析的结果进一步加工操作 public EntityProcessor(IModelExtractor extractor = null, ITargetRequestExtractor targetRequestExtractor = null, params IDataHandler[] dataHandlers) - : base(new ModelDefine(), extractor ?? new ModelExtractor(), targetRequestExtractor, dataHandlers) + : base(new ModelDefinition(), extractor ?? new ModelExtractor(), targetRequestExtractor, dataHandlers) { } } diff --git a/src/DotnetSpider.Extension/Processor/ModelProcessor.cs b/src/DotnetSpider.Extension/Processor/ModelProcessor.cs index 860c82800..f369ce461 100644 --- a/src/DotnetSpider.Extension/Processor/ModelProcessor.cs +++ b/src/DotnetSpider.Extension/Processor/ModelProcessor.cs @@ -63,9 +63,9 @@ public ModelProcessor(IModel model, IModelExtractor extractor = null, ITargetReq return; } - if (Model.TargetUrlsSelectors != null && Model.TargetUrlsSelectors.Any()) + if (Model.TargetRequestSelectors != null && Model.TargetRequestSelectors.Any()) { - foreach (var targetUrlsSelector in Model.TargetUrlsSelectors) + foreach (var targetUrlsSelector in Model.TargetRequestSelectors) { var patterns = targetUrlsSelector.Patterns?.Select(x => x?.Trim()).Distinct().ToArray(); var xpaths = targetUrlsSelector.XPaths?.Select(x => x?.Trim()).Distinct().ToList(); @@ -157,7 +157,7 @@ protected override void Handle(Page page) } } - page.AddResultItem(Model.Identity, new Tuple>(Model, items)); + page.AddResultItem(Model.Identity, new Tuple>(Model, items)); } } } \ No newline at end of file diff --git a/src/DotnetSpider.Extension/Processor/TargetRequestHandler.cs b/src/DotnetSpider.Extension/Processor/TargetRequestHandler.cs index e93a0fd14..b931d65f8 100644 --- a/src/DotnetSpider.Extension/Processor/TargetRequestHandler.cs +++ b/src/DotnetSpider.Extension/Processor/TargetRequestHandler.cs @@ -16,13 +16,13 @@ public class TargetRequestHandler : IBeforeProcessorHandler private readonly bool _extractByProcessor; /// - /// Construct a instance. + /// Construct a instance. /// /// /// 构造方法 /// - /// 目标链接解析器 - /// Processor是否还需要执行目标链接解析工作(Should continue to execute ) + /// 目标链接解析器 + /// Processor是否还需要执行目标链接解析工作(Should continue to execute ) public TargetRequestHandler(ITargetRequestExtractor targetRequestExtractor, bool extractByProcessor = false) { _targetUrlsExtractor = targetRequestExtractor ?? throw new ArgumentNullException(nameof(targetRequestExtractor)); @@ -30,7 +30,7 @@ public TargetRequestHandler(ITargetRequestExtractor targetRequestExtractor, bool } /// - /// Execute . + /// Execute . /// /// /// 执行目标链接解析器 diff --git a/src/DotnetSpider.Extension/SampleSpider.cs b/src/DotnetSpider.Extension/SampleSpider.cs index 53891c33c..c4070b876 100644 --- a/src/DotnetSpider.Extension/SampleSpider.cs +++ b/src/DotnetSpider.Extension/SampleSpider.cs @@ -12,20 +12,20 @@ public class SampleSpider { public static void Run() { - var site = new Site { }; - var mode = new ModelDefine + var site = new Site(); + var mode = new ModelDefinition ( new Selector(".//div[@class='result']"), new[] { - new Field("Keyword","Keyword",SelectorType.Enviroment), - new Field(".//h3[@class='c-title']/a","Title"), - new Field(".//h3[@class='c-title']/a/@href","Url"), - new Field(".//div/p[@class='c-author']/text()","Website"), - new Field(".//div/span/a[@class='c-cache']/@href","Snapshot"), - new Field(".//div[@class='c-summary c-row ']","Details"), - new Field(".","PlainText"), - new Field( "today","atime", SelectorType.Enviroment,DataType.Date) + new FieldSelector("Keyword","Keyword",SelectorType.Enviroment), + new FieldSelector(".//h3[@class='c-title']/a","Title"), + new FieldSelector(".//h3[@class='c-title']/a/@href","Url"), + new FieldSelector(".//div/p[@class='c-author']/text()","Website"), + new FieldSelector(".//div/span/a[@class='c-cache']/@href","Snapshot"), + new FieldSelector(".//div[@class='c-summary c-row ']","Details"), + new FieldSelector(".","PlainText"), + new FieldSelector( "today","atime", SelectorType.Enviroment,DataType.Date) } , new TableInfo("baidu", "search"), null); var processor = new ModelProcessor(mode); diff --git a/src/DotnetSpider.Extension/Scheduler/RedisScheduler.cs b/src/DotnetSpider.Extension/Scheduler/RedisScheduler.cs index a3e2db96e..155afc804 100644 --- a/src/DotnetSpider.Extension/Scheduler/RedisScheduler.cs +++ b/src/DotnetSpider.Extension/Scheduler/RedisScheduler.cs @@ -28,13 +28,10 @@ public class RedisScheduler : DuplicateRemovedScheduler, IDuplicateRemover private readonly string _itemKey; private readonly string _errorCountKey; private readonly string _successCountKey; - private readonly string _identityMd5; private readonly AutomicLong _successCounter = new AutomicLong(0); private readonly AutomicLong _errorCounter = new AutomicLong(0); - private readonly string _connectString; private RedisConnection _redisConnection; - private readonly string _identity; - private readonly Dictionary Cache = new Dictionary(); + private readonly Dictionary _cache = new Dictionary(); /// /// 批量加载时的每批次加载数 @@ -46,7 +43,7 @@ public class RedisScheduler : DuplicateRemovedScheduler, IDuplicateRemover /// /// RedisScheduler是否会使用互联网 /// - protected override bool UseInternet { get; set; } = true; + protected sealed override bool UseInternet { get; set; } = true; /// /// 构造方法 @@ -65,33 +62,31 @@ public RedisScheduler(string identity, string connectString) { if (string.IsNullOrWhiteSpace(identity)) { - throw new ArgumentNullException("identity should not be empty"); + throw new ArgumentNullException(nameof(identity)); } if (string.IsNullOrWhiteSpace(connectString)) { - throw new ArgumentNullException("connectString should not be empty"); + throw new ArgumentNullException(nameof(connectString)); } - _identity = identity; - _connectString = connectString; + + var s = connectString; DuplicateRemover = this; - var md5 = Cryptography.ToShortMd5(identity); + var md5 = identity.ToShortMd5(); _itemKey = $"dotnetspider:scheduler:{md5}:items"; _setKey = $"dotnetspider:scheduler:{md5}:set"; _queueKey = $"dotnetspider:scheduler:{md5}:queue"; _errorCountKey = $"dotnetspider:scheduler:{md5}:countOfFailures"; _successCountKey = $"dotnetspider:scheduler:{md5}:countOfSuccess"; - _identityMd5 = md5; - var action = new Action(() => { - if (!Cache.ContainsKey(_connectString)) + if (!_cache.ContainsKey(s)) { - _redisConnection = new RedisConnection(_connectString); - Cache.Add(_connectString, _redisConnection); + _redisConnection = new RedisConnection(s); + _cache.Add(s, _redisConnection); } - _redisConnection.Database.SortedSetAdd(TasksKey, _identity, (long)DateTimeUtil.GetCurrentUnixTimeNumber()); + _redisConnection.Database.SortedSetAdd(TasksKey, identity, (long)DateTimeUtil.GetCurrentUnixTimeNumber()); }); if (UseInternet) @@ -249,7 +244,7 @@ public override void Dispose() /// 批量导入 /// /// 请求对象 - public override void Reload(IEnumerable requests) + public override void Reload(ICollection requests) { var action = new Action(() => { diff --git a/src/DotnetSpider.Extraction.Test/EntityExtractorTest.cs b/src/DotnetSpider.Extraction.Test/EntityExtractorTest.cs index dee9fa109..3ee1b9650 100644 --- a/src/DotnetSpider.Extraction.Test/EntityExtractorTest.cs +++ b/src/DotnetSpider.Extraction.Test/EntityExtractorTest.cs @@ -16,7 +16,7 @@ public class EntityExtractorTest public void Extract() { ModelExtractor extractor = new ModelExtractor(); - var model = new ModelDefine(); + var model = new ModelDefinition(); var selectable = new Selectable( File.ReadAllText(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "Jd.html")), "http://jd.com", ""); selectable.Properties = new Dictionary { @@ -45,34 +45,34 @@ private class Product public string AAA; private string bb; - [Field(Expression = "cat", Type = SelectorType.Enviroment)] + [FieldSelector(Expression = "cat", Type = SelectorType.Enviroment)] public string CategoryName { get; set; } - [Field(Expression = "cat3", Type = SelectorType.Enviroment)] + [FieldSelector(Expression = "cat3", Type = SelectorType.Enviroment)] public int CategoryId { get; set; } - [Field(Expression = "./div[1]/a/@href")] + [FieldSelector(Expression = "./div[1]/a/@href")] public string Url { get; set; } - [Field(Expression = "./@data-sku")] + [FieldSelector(Expression = "./@data-sku")] public string Sku { get; set; } - [Field(Expression = "./div[5]/strong/a")] + [FieldSelector(Expression = "./div[5]/strong/a")] public long CommentsCount { get; set; } - [Field(Expression = ".//div[@class='p-shop']/@data-shop_name")] + [FieldSelector(Expression = ".//div[@class='p-shop']/@data-shop_name")] public string ShopName { get; set; } - [Field(Expression = ".//div[@class='p-name']/a/em")] + [FieldSelector(Expression = ".//div[@class='p-name']/a/em")] public string Name { get; set; } - [Field(Expression = "./@venderid")] + [FieldSelector(Expression = "./@venderid")] public string VenderId { get; set; } - [Field(Expression = "./@jdzy_shop_id")] + [FieldSelector(Expression = "./@jdzy_shop_id")] public string JdzyShopId { get; set; } - [Field(Expression = "Today", Type = SelectorType.Enviroment)] + [FieldSelector(Expression = "Today", Type = SelectorType.Enviroment)] public DateTime RunId { get; set; } } } diff --git a/src/DotnetSpider.Extraction.Test/FormatterTest.cs b/src/DotnetSpider.Extraction.Test/FormatterTest.cs index 4c54167ab..20541eeb1 100644 --- a/src/DotnetSpider.Extraction.Test/FormatterTest.cs +++ b/src/DotnetSpider.Extraction.Test/FormatterTest.cs @@ -17,7 +17,7 @@ public void RegexFormatterTest() f.Formate(""); throw new Exception("TEST FAILED."); } - catch (ModelException se) + catch (ArgumentException se) { Assert.Equal("Pattern should not be null or empty", se.Message); } @@ -27,7 +27,7 @@ public void RegexFormatterTest() f.Formate(""); throw new Exception("TEST FAILED."); } - catch (ModelException se) + catch (ArgumentException se) { Assert.Equal("Pattern should not be null or empty", se.Message); } @@ -37,7 +37,7 @@ public void RegexFormatterTest() f.Formate(""); throw new Exception("TEST FAILED."); } - catch (ModelException se) + catch (ArgumentException se) { Assert.Equal("Pattern should not be null or empty", se.Message); } @@ -131,7 +131,7 @@ public void FormatStringFormaterTest() f.Formate(""); throw new Exception("TEST FAILED."); } - catch (ModelException se) + catch (ArgumentException se) { Assert.Equal("FormatString should not be null or empty", se.Message); } @@ -141,7 +141,7 @@ public void FormatStringFormaterTest() f.Formate(""); throw new Exception("TEST FAILED."); } - catch (ModelException se) + catch (ArgumentException se) { Assert.Equal("FormatString should not be null or empty", se.Message); } @@ -151,7 +151,7 @@ public void FormatStringFormaterTest() f.Formate(""); throw new Exception("TEST FAILED."); } - catch (ModelException se) + catch (ArgumentException se) { Assert.Equal("FormatString should not be null or empty", se.Message); } diff --git a/src/DotnetSpider.Extraction.Test/ModelDefineTest.cs b/src/DotnetSpider.Extraction.Test/ModelDefineTest.cs index d4aefe086..f7d5d291c 100644 --- a/src/DotnetSpider.Extraction.Test/ModelDefineTest.cs +++ b/src/DotnetSpider.Extraction.Test/ModelDefineTest.cs @@ -15,10 +15,10 @@ public void NotExistColumnAsIndex() { try { - var entityMetadata = new ModelDefine(); + var entityMetadata = new ModelDefinition(); throw new Exception("Test failed"); } - catch (ModelException exception) + catch (ArgumentException exception) { Assert.Equal("Columns set as index are not a property of your entity", exception.Message); } @@ -29,11 +29,11 @@ public void NotExistColumnAsUnique() { try { - var entityMetadata = new ModelDefine(); + var entityMetadata = new ModelDefinition(); throw new Exception("Test failed"); } - catch (ModelException exception) + catch (ArgumentException exception) { Assert.Equal("Columns set as unique are not a property of your entity", exception.Message); } @@ -42,7 +42,7 @@ public void NotExistColumnAsUnique() [Fact(DisplayName = "Formaters")] public void Formaters() { - var entity1 = new ModelDefine(); + var entity1 = new ModelDefinition(); var fields = entity1.Fields.ToArray(); var formatters = (fields[0]).Formatters; Assert.Equal(2, formatters.Length); @@ -56,7 +56,7 @@ public void ColumnOfIndexesOverLength() { try { - var entity = new ModelDefine(); + var entity = new ModelDefinition(); throw new Exception("Failed."); } @@ -71,7 +71,7 @@ public void ColumnOfUniqueOverLength() { try { - var entity = new ModelDefine(); + var entity = new ModelDefinition(); throw new Exception("Failed."); } @@ -84,38 +84,38 @@ public void ColumnOfUniqueOverLength() [Fact(DisplayName = "Indexes")] public void Indexes() { - var entity1 = new ModelDefine(); - Assert.Equal("Name3", entity1.TableInfo.Indexs[0]); - Assert.Equal(2, entity1.TableInfo.Uniques.Length); - Assert.Equal("Name,Name2", entity1.TableInfo.Uniques[0]); - Assert.Equal("Name2", entity1.TableInfo.Uniques[1]); + var entity1 = new ModelDefinition(); + Assert.Equal("Name3", entity1.Table.Indexs[0]); + Assert.Equal(2, entity1.Table.Uniques.Length); + Assert.Equal("Name,Name2", entity1.Table.Uniques[0]); + Assert.Equal("Name2", entity1.Table.Uniques[1]); } [Fact(DisplayName = "Schema")] public void Schema() { - var entityMetadata = new ModelDefine(); + var entityMetadata = new ModelDefinition(); - Assert.Equal("test", entityMetadata.TableInfo.Database); - Assert.Equal(TableNamePostfix.Monday, entityMetadata.TableInfo.Postfix); + Assert.Equal("test", entityMetadata.Table.Database); + Assert.Equal(TableNamePostfix.Monday, entityMetadata.Table.Postfix); - var entityMetadata1 = new ModelDefine(); - Assert.Null(entityMetadata1.TableInfo); + var entityMetadata1 = new ModelDefinition(); + Assert.Null(entityMetadata1.Table); } [Fact(DisplayName = "EntitySelector")] public void EntitySelector() { - var entity1 = new ModelDefine(); + var entity1 = new ModelDefinition(); Assert.Equal("expression", entity1.Selector.Expression); Assert.Equal(SelectorType.XPath, entity1.Selector.Type); - var entity2 = new ModelDefine(); + var entity2 = new ModelDefinition(); Assert.Equal("expression2", entity2.Selector.Expression); Assert.Equal(SelectorType.Css, entity2.Selector.Type); - var entity3 = new ModelDefine(); + var entity3 = new ModelDefinition(); Assert.Null(entity3.Selector); Assert.Equal("test.table", entity3.Identity); } @@ -125,10 +125,10 @@ public void NullModelSelector() { var fields = new[] { - new Field( "./div[1]/a/@href", "Url"), - new Field( "./@data-sku", "Sku"), + new FieldSelector( "./div[1]/a/@href", "Url"), + new FieldSelector( "./@data-sku", "Sku"), }; - ModelDefine model = new ModelDefine(null, fields); + ModelDefinition model = new ModelDefinition(null, fields); Assert.Null(model.Selector); Assert.Equal(2, model.Fields.Count); Assert.True(Guid.TryParse(model.Identity, out _)); @@ -143,10 +143,10 @@ public void NullTableInfo() var entitySelector = new Selector("//li[@class='gl-item']/div[contains(@class,'j-sku-item')]"); var fields = new[] { - new Field( "./div[1]/a/@href", "Url"), - new Field( "./@data-sku", "Sku"), + new FieldSelector( "./div[1]/a/@href", "Url"), + new FieldSelector( "./@data-sku", "Sku"), }; - ModelDefine model = new ModelDefine(entitySelector, fields); + ModelDefinition model = new ModelDefinition(entitySelector, fields); Assert.Equal("//li[@class='gl-item']/div[contains(@class,'j-sku-item')]", model.Selector.Expression); Assert.Equal(SelectorType.XPath, model.Selector.Type); Assert.Equal(2, model.Fields.Count); @@ -159,11 +159,11 @@ public void TableInfo() var entitySelector = new Selector("//li[@class='gl-item']/div[contains(@class,'j-sku-item')]"); var fields = new[] { - new Field( "./div[1]/a/@href", "Url"), - new Field( "./@data-sku", "Sku"), + new FieldSelector( "./div[1]/a/@href", "Url"), + new FieldSelector( "./@data-sku", "Sku"), }; var tableInfo = new TableInfo("db01", "tb1"); - ModelDefine model = new ModelDefine(entitySelector, fields, tableInfo); + ModelDefinition model = new ModelDefinition(entitySelector, fields, tableInfo); Assert.Equal(2, model.Fields.Count); Assert.Equal("db01.tb1", model.Identity); } @@ -171,7 +171,7 @@ public void TableInfo() [Fact(DisplayName = "NullTableInfoEntityModelDefine")] public void NullTableInfoEntityModelDefine() { - ModelDefine model = new ModelDefine(); + ModelDefinition model = new ModelDefinition(); Assert.Equal(2, model.Fields.Count); var field1 = model.Fields.First(); @@ -183,14 +183,14 @@ public void NullTableInfoEntityModelDefine() var field2 = model.Fields.ElementAt(1); Assert.Equal(10, field2.Length); - Assert.Null(model.TableInfo); + Assert.Null(model.Table); Assert.Equal("DotnetSpider.Extension.Test.Model.ModelDefineTest+NullTableInfoEntity", model.Identity); } [Fact(DisplayName = "TableInfoEntityModelDefine")] public void TableInfoEntityModelDefine() { - ModelDefine model = new ModelDefine(); + ModelDefinition model = new ModelDefinition(); Assert.Equal(2, model.Fields.Count); var field1 = model.Fields.First(); @@ -208,10 +208,10 @@ public void TableInfoEntityModelDefine() [EntitySelector(Expression = "//li[@class='gl-item']/div[contains(@class,'j-sku-item')]")] private class NullTableInfoEntity { - [Field(Expression = "cat", Type = SelectorType.Enviroment)] + [FieldSelector(Expression = "cat", Type = SelectorType.Enviroment)] public string CategoryName { get; set; } - [Field(Expression = "./@jdzy_shop_id", Length = 10)] + [FieldSelector(Expression = "./@jdzy_shop_id", Length = 10)] public string JdzyShopId { get; set; } } @@ -219,10 +219,10 @@ private class NullTableInfoEntity [EntitySelector(Expression = "//li[@class='gl-item']/div[contains(@class,'j-sku-item')]")] private class TableInfoEntity { - [Field(Expression = "cat", Type = SelectorType.Enviroment)] + [FieldSelector(Expression = "cat", Type = SelectorType.Enviroment)] public string CategoryName { get; set; } - [Field(Expression = "./@jdzy_shop_id", Length = 10)] + [FieldSelector(Expression = "./@jdzy_shop_id", Length = 10)] public string JdzyShopId { get; set; } } @@ -230,7 +230,7 @@ private class TableInfoEntity [EntitySelector(Expression = "expression")] private class Entity7 { - [Field(Expression = "")] + [FieldSelector(Expression = "")] public string Name { get; set; } } @@ -238,54 +238,54 @@ private class Entity7 [EntitySelector(Expression = "expression2", Type = SelectorType.Css)] private class Entity8 { - [Field(Expression = "")] + [FieldSelector(Expression = "")] public string Name { get; set; } } [TableInfo("test", "table")] private class Entity9 { - [Field(Expression = "")] + [FieldSelector(Expression = "")] public string Name { get; set; } } [TableInfo("test", "table", TableNamePostfix.Monday)] private class Entity4 { - [Field(Expression = "")] + [FieldSelector(Expression = "")] public string Name { get; set; } } private class Entity14 { - [Field(Expression = "Url")] + [FieldSelector(Expression = "Url")] public string Url { get; set; } } [TableInfo("test", "table", Indexs = new[] { "Name3" }, Uniques = new[] { "Name,Name2", "Name2" })] private class Entity10 { - [Field(Expression = "", Length = 100)] + [FieldSelector(Expression = "", Length = 100)] public string Name { get; set; } - [Field(Expression = "", Length = 100)] + [FieldSelector(Expression = "", Length = 100)] public string Name2 { get; set; } - [Field(Expression = "", Length = 100)] + [FieldSelector(Expression = "", Length = 100)] public string Name3 { get; set; } } [TableInfo("test", "table", Uniques = new[] { "c1" })] private class Entity18 { - [Field(Expression = "", Length = 300)] + [FieldSelector(Expression = "", Length = 300)] public string c1 { get; set; } } [TableInfo("test", "table", Indexs = new[] { "c1" })] private class Entity19 { - [Field(Expression = "", Length = 300)] + [FieldSelector(Expression = "", Length = 300)] public string c1 { get; set; } } @@ -294,7 +294,7 @@ private class Entity11 { [ReplaceFormatter(NewValue = "a", OldValue = "b")] [RegexFormatter(Pattern = "a(*)")] - [Field(Expression = "Name")] + [FieldSelector(Expression = "Name")] public string Name { get; set; } } @@ -302,14 +302,14 @@ private class Entity11 [TableInfo("test", "table", Uniques = new[] { "c1" })] private class Entity3 { - [Field(Expression = "")] + [FieldSelector(Expression = "")] public string Url { get; set; } } [TableInfo("test", "table", Indexs = new[] { "c1" })] private class Entity2 { - [Field(Expression = "")] + [FieldSelector(Expression = "")] public string Url { get; set; } } } diff --git a/src/DotnetSpider.Extraction.Test/ModelExtractorTest.cs b/src/DotnetSpider.Extraction.Test/ModelExtractorTest.cs index 3e6da2a45..e0e3b75b2 100644 --- a/src/DotnetSpider.Extraction.Test/ModelExtractorTest.cs +++ b/src/DotnetSpider.Extraction.Test/ModelExtractorTest.cs @@ -24,10 +24,10 @@ public void NullModelSelector() { var fields = new[] { - new Field( "./div[@class='title']", "title"), - new Field( "./div[@class='dotnetspider']", "dotnetspider"), + new FieldSelector( "./div[@class='title']", "title"), + new FieldSelector( "./div[@class='dotnetspider']", "dotnetspider"), }; - ModelDefine model = new ModelDefine(null, fields); + var model = new ModelDefinition(null, fields); ModelExtractor extractor = new ModelExtractor(); var result = extractor.Extract(CreatePage(), model).First() as Dictionary; @@ -42,9 +42,9 @@ public void ModelSelector() var entitySelector = new Selector("//div[@class='aaaa']"); var fields = new[] { - new Field( ".", "title"), + new FieldSelector( ".", "title"), }; - ModelDefine model = new ModelDefine(entitySelector, fields); + var model = new ModelDefinition(entitySelector, fields); ModelExtractor extractor = new ModelExtractor(); var results = extractor.Extract(CreatePage(), model).ToList(); @@ -58,7 +58,7 @@ public void ModelSelector() public void NullEntityModelSelector() { ModelExtractor extractor = new ModelExtractor(); - IModel model = new ModelDefine(); + IModel model = new ModelDefinition(); var result = extractor.Extract(CreatePage(), model).First() as Dictionary; @@ -70,7 +70,7 @@ public void NullEntityModelSelector() public void EntityModelSelector() { ModelExtractor extractor = new ModelExtractor(); - IModel model = new ModelDefine(); + IModel model = new ModelDefinition(); var results = extractor.Extract(CreatePage(), model).ToList(); Assert.Equal(2, results.Count()); @@ -81,17 +81,17 @@ public void EntityModelSelector() private class N { - [Field(Expression = "./div[@class='title']")] + [FieldSelector(Expression = "./div[@class='title']")] public string title { get; set; } - [Field(Expression = "./div[@class='dotnetspider']")] + [FieldSelector(Expression = "./div[@class='dotnetspider']")] public string dotnetspider { get; set; } } [EntitySelector(Expression = "//div[@class='aaaa']")] private class E { - [Field(Expression = ".")] + [FieldSelector(Expression = ".")] public string title { get; set; } } diff --git a/src/DotnetSpider.Extraction/DotnetSpider.Extraction.csproj b/src/DotnetSpider.Extraction/DotnetSpider.Extraction.csproj index 8267a1820..3ae7755ba 100644 --- a/src/DotnetSpider.Extraction/DotnetSpider.Extraction.csproj +++ b/src/DotnetSpider.Extraction/DotnetSpider.Extraction.csproj @@ -3,7 +3,7 @@ net40;net45;netstandard2.0 true true - 3.0.0-beta1 + 3.0.0 zlzforever@163.com; DotnetSpider.Extraction Copyright 2018 Lewis Zou diff --git a/src/DotnetSpider.Extraction/Model/Attribute/Field.cs b/src/DotnetSpider.Extraction/Model/Attribute/FieldSelector.cs similarity index 70% rename from src/DotnetSpider.Extraction/Model/Attribute/Field.cs rename to src/DotnetSpider.Extraction/Model/Attribute/FieldSelector.cs index 355d052d8..e9663a09f 100644 --- a/src/DotnetSpider.Extraction/Model/Attribute/Field.cs +++ b/src/DotnetSpider.Extraction/Model/Attribute/FieldSelector.cs @@ -2,61 +2,16 @@ namespace DotnetSpider.Extraction.Model.Attribute { - public enum DataType - { - None, - Int, - Float, - Double, - DateTime, - Date, - Long, - Bool, - String, - Decimal - } - - /// - /// 额外选项的定义 - /// - public enum FieldOptions - { - /// - /// 不作任何操作 - /// - None, - - /// - /// For html contene - /// - OuterHtml, - - /// - /// For html contene - /// - InnerHtml, - - /// - /// For html contene - /// - InnerText, - - /// - /// 取的查询器结果的个数作为结果 - /// - Count - } - /// /// 属性选择器的定义 /// [AttributeUsage(AttributeTargets.Property)] - public class Field : Selector + public class FieldSelector : Selector { /// /// 构造方法 /// - public Field() + public FieldSelector() { } @@ -68,7 +23,7 @@ public Field() /// 表达式 /// 数据类型 /// 类型长度 - public Field(string expression, string name, SelectorType type = SelectorType.XPath, DataType dataType = DataType.String, int length = 255) + public FieldSelector(string expression, string name, SelectorType type = SelectorType.XPath, DataType dataType = DataType.String, int length = 255) : base(expression, type) { Name = name; diff --git a/src/DotnetSpider.Extraction/Model/Attribute/TableInfo.cs b/src/DotnetSpider.Extraction/Model/Attribute/TableInfo.cs index 74edfde2a..1bdac794c 100644 --- a/src/DotnetSpider.Extraction/Model/Attribute/TableInfo.cs +++ b/src/DotnetSpider.Extraction/Model/Attribute/TableInfo.cs @@ -2,36 +2,6 @@ namespace DotnetSpider.Extraction.Model.Attribute { - public enum TableNamePostfix - { - None, - - /// - /// 表名的后缀为星期一的时间 - /// - Monday, - - /// - /// 表名的后缀为今天的时间 {name}_20171212 - /// - Today, - - /// - /// 表名的后缀为当月的第一天 {name}_20171201 - /// - FirstDayOfTheMonth, - - /// - /// 表名的后缀为当月 {name}_201712 - /// - Month, - - /// - /// 表名的后缀为上个月 {name}_201711 - /// - LastMonth - } - /// /// 爬虫实体类对应的表信息 /// diff --git a/src/DotnetSpider.Extraction/Model/BaseEntity.cs b/src/DotnetSpider.Extraction/Model/BaseEntity.cs index 4abf7b0df..c70d2f5ff 100644 --- a/src/DotnetSpider.Extraction/Model/BaseEntity.cs +++ b/src/DotnetSpider.Extraction/Model/BaseEntity.cs @@ -4,7 +4,7 @@ namespace DotnetSpider.Extraction.Model { public class BaseEntity { - [Field(DataType = DataType.Long, IsPrimary = true, Expression = "Id", Type = SelectorType.Enviroment)] + [FieldSelector(DataType = DataType.Long, IsPrimary = true, Expression = "Id", Type = SelectorType.Enviroment)] // ReSharper disable once UnusedAutoPropertyAccessor.Global public long Id { get; set; } } diff --git a/src/DotnetSpider.Extraction/Model/DataType.cs b/src/DotnetSpider.Extraction/Model/DataType.cs new file mode 100644 index 000000000..eaf1068c4 --- /dev/null +++ b/src/DotnetSpider.Extraction/Model/DataType.cs @@ -0,0 +1,21 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace DotnetSpider.Extraction.Model +{ + public enum DataType + { + None, + Int, + Float, + Double, + DateTime, + Date, + Long, + Bool, + String, + Decimal + } +} diff --git a/src/DotnetSpider.Extraction/Model/FieldOptions.cs b/src/DotnetSpider.Extraction/Model/FieldOptions.cs new file mode 100644 index 000000000..899f463a8 --- /dev/null +++ b/src/DotnetSpider.Extraction/Model/FieldOptions.cs @@ -0,0 +1,38 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace DotnetSpider.Extraction.Model +{ + /// + /// 额外选项的定义 + /// + public enum FieldOptions + { + /// + /// 不作任何操作 + /// + None, + + /// + /// For html contene + /// + OuterHtml, + + /// + /// For html contene + /// + InnerHtml, + + /// + /// For html contene + /// + InnerText, + + /// + /// 取的查询器结果的个数作为结果 + /// + Count + } +} diff --git a/src/DotnetSpider.Extraction/Model/Formatter/RegexAppendFormatter.cs b/src/DotnetSpider.Extraction/Model/Formatter/RegexAppendFormatter.cs index 44d3e047c..218dc3346 100644 --- a/src/DotnetSpider.Extraction/Model/Formatter/RegexAppendFormatter.cs +++ b/src/DotnetSpider.Extraction/Model/Formatter/RegexAppendFormatter.cs @@ -37,12 +37,12 @@ protected override void CheckArguments() { if (string.IsNullOrWhiteSpace(Pattern)) { - throw new ModelException("Pattern should not be null or empty"); + throw new ArgumentException("Pattern should not be null or empty"); } if (string.IsNullOrWhiteSpace(AppendValue)) { - throw new ModelException("Append should not be null or empty"); + throw new ArgumentException("Append should not be null or empty"); } } } diff --git a/src/DotnetSpider.Extraction/Model/Formatter/RegexFormater.cs b/src/DotnetSpider.Extraction/Model/Formatter/RegexFormater.cs index 763b78c06..080d3649f 100644 --- a/src/DotnetSpider.Extraction/Model/Formatter/RegexFormater.cs +++ b/src/DotnetSpider.Extraction/Model/Formatter/RegexFormater.cs @@ -79,7 +79,7 @@ protected override void CheckArguments() { if (string.IsNullOrWhiteSpace(Pattern)) { - throw new ModelException("Pattern should not be null or empty"); + throw new ArgumentException("Pattern should not be null or empty"); } } } diff --git a/src/DotnetSpider.Extraction/Model/Formatter/RegexReplaceFormater.cs b/src/DotnetSpider.Extraction/Model/Formatter/RegexReplaceFormater.cs index 0b3c8496e..aab202729 100644 --- a/src/DotnetSpider.Extraction/Model/Formatter/RegexReplaceFormater.cs +++ b/src/DotnetSpider.Extraction/Model/Formatter/RegexReplaceFormater.cs @@ -36,7 +36,7 @@ protected override void CheckArguments() { if (string.IsNullOrWhiteSpace(Pattern)) { - throw new ModelException("Pattern should not be null or empty"); + throw new ArgumentException("Pattern should not be null or empty"); } } } diff --git a/src/DotnetSpider.Extraction/Model/Formatter/SplitFormater.cs b/src/DotnetSpider.Extraction/Model/Formatter/SplitFormater.cs index af4691249..a41af93c8 100644 --- a/src/DotnetSpider.Extraction/Model/Formatter/SplitFormater.cs +++ b/src/DotnetSpider.Extraction/Model/Formatter/SplitFormater.cs @@ -45,12 +45,12 @@ protected override void CheckArguments() { if (Splitor == null || Splitor.Length == 0) { - throw new ModelException("Splitors should not be null or empty"); + throw new ArgumentException("Splitors should not be null or empty"); } if (ElementAt < 0) { - throw new ModelException("ElementAt should larger than 0"); + throw new ArgumentException("ElementAt should larger than 0"); } } } diff --git a/src/DotnetSpider.Extraction/Model/Formatter/StringFormater.cs b/src/DotnetSpider.Extraction/Model/Formatter/StringFormater.cs index 0dbe9d086..1ec27a94a 100644 --- a/src/DotnetSpider.Extraction/Model/Formatter/StringFormater.cs +++ b/src/DotnetSpider.Extraction/Model/Formatter/StringFormater.cs @@ -30,7 +30,7 @@ protected override void CheckArguments() { if (string.IsNullOrWhiteSpace(Format)) { - throw new ModelException("FormatString should not be null or empty"); + throw new ArgumentException("FormatString should not be null or empty"); } } } diff --git a/src/DotnetSpider.Extraction/Model/Formatter/TimeStampFormater.cs b/src/DotnetSpider.Extraction/Model/Formatter/TimeStampFormater.cs index 679de7eeb..93c70a8a2 100644 --- a/src/DotnetSpider.Extraction/Model/Formatter/TimeStampFormater.cs +++ b/src/DotnetSpider.Extraction/Model/Formatter/TimeStampFormater.cs @@ -36,7 +36,7 @@ protected override object FormateValue(object value) } default: { - throw new ModelException("Wrong input timestamp"); + throw new ArgumentException("Wrong input timestamp"); } } return dt.ToString("yyyy-MM-dd HH:mm:ss"); diff --git a/src/DotnetSpider.Extraction/Model/Formatter/UrlEncodeFormater.cs b/src/DotnetSpider.Extraction/Model/Formatter/UrlEncodeFormater.cs index 11dd2c50c..22ed80bfc 100644 --- a/src/DotnetSpider.Extraction/Model/Formatter/UrlEncodeFormater.cs +++ b/src/DotnetSpider.Extraction/Model/Formatter/UrlEncodeFormater.cs @@ -41,7 +41,7 @@ protected override void CheckArguments() var encoding = System.Text.Encoding.GetEncoding(Encoding); if (encoding == null) { - throw new ModelException($"Can't get encoding: {Encoding}"); + throw new ArgumentException($"Can't get encoding: {Encoding}"); } } } diff --git a/src/DotnetSpider.Extraction/Model/IEntityExtractor.cs b/src/DotnetSpider.Extraction/Model/IEntityExtractor.cs index ba4c36fb9..babf40223 100644 --- a/src/DotnetSpider.Extraction/Model/IEntityExtractor.cs +++ b/src/DotnetSpider.Extraction/Model/IEntityExtractor.cs @@ -10,6 +10,6 @@ public interface IModelExtractor /// 可查询对象 /// 解析模型 /// 实体对象 - IEnumerable Extract(Selectable selectable, IModel model); + IList Extract(Selectable selectable, IModel model); } } diff --git a/src/DotnetSpider.Extraction/Model/IModel.cs b/src/DotnetSpider.Extraction/Model/IModel.cs index c15709d59..5280718db 100644 --- a/src/DotnetSpider.Extraction/Model/IModel.cs +++ b/src/DotnetSpider.Extraction/Model/IModel.cs @@ -27,17 +27,17 @@ public interface IModel /// 爬虫实体对应的数据库表信息 /// 允许 TableInfo 为空, 有可能是临时数据并不需要落库的 /// - TableInfo TableInfo { get; } + TableInfo Table { get; } /// /// 爬虫实体定义的数据库列信息 /// - HashSet Fields { get; } + HashSet Fields { get; } /// /// 目标链接的选择器 /// - IEnumerable TargetUrlsSelectors { get; } + IEnumerable TargetRequestSelectors { get; } /// /// 共享值的选择器 diff --git a/src/DotnetSpider.Extraction/Model/ModelDefine.cs b/src/DotnetSpider.Extraction/Model/ModelDefinition.cs similarity index 72% rename from src/DotnetSpider.Extraction/Model/ModelDefine.cs rename to src/DotnetSpider.Extraction/Model/ModelDefinition.cs index 9508f4856..197477c61 100644 --- a/src/DotnetSpider.Extraction/Model/ModelDefine.cs +++ b/src/DotnetSpider.Extraction/Model/ModelDefinition.cs @@ -6,7 +6,7 @@ namespace DotnetSpider.Extraction.Model { - public class ModelDefine : IModel + public class ModelDefinition : IModel { /// /// 数据模型的选择器 @@ -26,17 +26,17 @@ public class ModelDefine : IModel /// /// 爬虫实体对应的数据库表信息 /// - public TableInfo TableInfo { get; protected set; } + public TableInfo Table { get; protected set; } /// /// 爬虫实体定义的数据库列信息 /// - public HashSet Fields { get; protected set; } + public HashSet Fields { get; protected set; } /// /// 目标链接的选择器 /// - public IEnumerable TargetUrlsSelectors { get; protected set; } + public IEnumerable TargetRequestSelectors { get; protected set; } /// /// 共享值的选择器 @@ -45,44 +45,44 @@ public class ModelDefine : IModel public string Identity { get; protected set; } - public ModelDefine(Selector selector, IEnumerable fields, TableInfo table, - TargetRequestSelector targetUrlsSelector) - : this(selector, fields, table, new[] { targetUrlsSelector }) + public ModelDefinition(Selector selector, IEnumerable fields, TableInfo table, + TargetRequestSelector targetRequestSelector) + : this(selector, fields, table, new[] { targetRequestSelector }) { } - public ModelDefine(Selector selector, IEnumerable fields, TableInfo table = null, - IEnumerable targetUrlsSelectors = null, + public ModelDefinition(Selector selector, IEnumerable fields, TableInfo table = null, + IEnumerable targetRequestSelectors = null, IEnumerable sharedValueSelectors = null, int take = 0, bool takeFromHead = true) : this() { Selector = selector; - TableInfo = table; + Table = table; if (fields == null) { - throw new ModelException($"{nameof(fields)} should not be null."); + throw new ExtractionException($"{nameof(fields)} should not be null."); } - Fields = new HashSet(fields); + Fields = new HashSet(fields); if (Fields.Count == 0) { - throw new ModelException("Count of fields should large than 0."); + throw new ExtractionException("Count of fields should large than 0."); } - TargetUrlsSelectors = targetUrlsSelectors; + TargetRequestSelectors = targetRequestSelectors; SharedValueSelectors = sharedValueSelectors; Take = take; TakeFromHead = takeFromHead; - Identity = TableInfo == null ? Guid.NewGuid().ToString("N") : $"{TableInfo.Database}.{TableInfo.FullName}"; + Identity = Table == null ? Guid.NewGuid().ToString("N") : $"{Table.Database}.{Table.FullName}"; } - protected ModelDefine() + protected ModelDefinition() { } } - public class ModelDefine : ModelDefine + public class ModelDefinition : ModelDefinition { - public ModelDefine() + public ModelDefinition() { var type = typeof(T); @@ -128,10 +128,10 @@ public ModelDefine() var properties = type.GetProperties(BindingFlags.Instance | BindingFlags.Public); - var fields = new HashSet(); + var fields = new HashSet(); foreach (var property in properties) { - var field = property.GetCustomAttributes(typeof(Field), true).FirstOrDefault() as Field; + var field = property.GetCustomAttributes(typeof(FieldSelector), true).FirstOrDefault() as FieldSelector; if (field == null) { @@ -149,17 +149,17 @@ public ModelDefine() } Selector = selector; - TableInfo = tableInfo; + Table = tableInfo; Fields = fields; - TargetUrlsSelectors = targetUrlsSelectors; + TargetRequestSelectors = targetUrlsSelectors; SharedValueSelectors = sharedValueSelectors; Take = take; TakeFromHead = takeFromHead; - if (TableInfo != null) + if (Table != null) { - Identity = $"{TableInfo.Database}.{TableInfo.FullName}"; + Identity = $"{Table.Database}.{Table.FullName}"; } else { @@ -170,7 +170,7 @@ public ModelDefine() if (columns.Count == 0) { - throw new ModelException($"Columns is necessary for {name}"); + throw new ArgumentException($"Columns is necessary for {name}"); } if (tableInfo != null) @@ -181,13 +181,13 @@ public ModelDefine() { if (columns.All(c => c.Name != column)) { - throw new ModelException("Columns set to update are not a property of your entity"); + throw new ArgumentException("Columns set to update are not a property of your entity"); } } if (tableInfo.UpdateColumns.Length == 0) { - throw new ModelException("There is no column need update"); + throw new ArgumentException("There is no column need update"); } } @@ -200,7 +200,7 @@ public ModelDefine() if (items.Count == 0) { - throw new ModelException("Index should contain more than a column"); + throw new ArgumentException("Index should contain more than a column"); } foreach (var item in items) @@ -208,12 +208,12 @@ public ModelDefine() var column = columns.FirstOrDefault(c => c.Name == item); if (column == null) { - throw new ModelException("Columns set as index are not a property of your entity"); + throw new ArgumentException("Columns set as index are not a property of your entity"); } if (column.DataType == DataType.String && (column.Length <= 0 || column.Length > 256)) { - throw new ModelException("Column length of index should not large than 256"); + throw new ArgumentException("Column length of index should not large than 256"); } } @@ -230,7 +230,7 @@ public ModelDefine() if (items.Count == 0) { - throw new ModelException("Unique should contain more than a column"); + throw new ArgumentException("Unique should contain more than a column"); } foreach (var item in items) @@ -238,12 +238,12 @@ public ModelDefine() var column = columns.FirstOrDefault(c => c.Name == item); if (column == null) { - throw new ModelException("Columns set as unique are not a property of your entity"); + throw new ArgumentException("Columns set as unique are not a property of your entity"); } if (column.DataType == DataType.String && (column.Length <= 0 || column.Length > 256)) { - throw new ModelException("Column length of unique should not large than 256"); + throw new ArgumentException("Column length of unique should not large than 256"); } } diff --git a/src/DotnetSpider.Extraction/Model/ModelException.cs b/src/DotnetSpider.Extraction/Model/ModelException.cs deleted file mode 100644 index 433a94393..000000000 --- a/src/DotnetSpider.Extraction/Model/ModelException.cs +++ /dev/null @@ -1,20 +0,0 @@ -using System; - -namespace DotnetSpider.Extraction.Model -{ - public class ModelException : Exception - { - /// - /// 构造方法 - /// - /// 异常信息 - public ModelException(string msg) : base(msg) { } - - /// - /// 构造方法 - /// - /// 异常信息 - /// The exception that is the cause of the current exception, or a null reference (Nothing in Visual Basic) if no inner exception is specified. - public ModelException(string msg, Exception e) : base(msg, e) { } - } -} diff --git a/src/DotnetSpider.Extraction/Model/ModelExtractor.cs b/src/DotnetSpider.Extraction/Model/ModelExtractor.cs index 75127a110..d693e5244 100644 --- a/src/DotnetSpider.Extraction/Model/ModelExtractor.cs +++ b/src/DotnetSpider.Extraction/Model/ModelExtractor.cs @@ -15,7 +15,7 @@ public class ModelExtractor : IModelExtractor /// 可查询对象 /// 解析模型 /// 实体对象 - public virtual IEnumerable Extract(Selectable selectable, IModel model) + public virtual IList Extract(Selectable selectable, IModel model) { List results = new List(); if (selectable.Properties == null) @@ -102,34 +102,9 @@ private Dictionary ExtractObject(IModel model, ISelectable obj, } return dataObject; - - //if (dataObject != null && EntityDefine.LinkToNexts != null) - //{ - // foreach (var targetUrl in EntityDefine.LinkToNexts) - // { - // Dictionary extras = new Dictionary(); - // if (targetUrl.Extras != null) - // { - // foreach (var extra in targetUrl.Extras) - // { - // extras.Add(extra, result[extra]); - // } - // } - // Dictionary allExtras = new Dictionary(); - // foreach (var extra in page.Request.Extras.Union(extras)) - // { - // allExtras.Add(extra.Key, extra.Value); - // } - // var value = result[targetUrl.PropertyName]; - // if (value != null) - // { - // page.AddTargetRequest(new Request(value.ToString(), allExtras)); - // } - // } - //} } - private string ExtractField(Field field, ISelectable item, Selectable root, int index) + private string ExtractField(FieldSelector field, ISelectable item, Selectable root, int index) { if (field == null) { @@ -226,7 +201,7 @@ public ModelExtractor() } } - public override IEnumerable Extract(Selectable response, IModel model) + public override IList Extract(Selectable response, IModel model) { var items = base.Extract(response, model)?.ToList(); @@ -258,7 +233,7 @@ public override IEnumerable Extract(Selectable response, IModel model) } catch { - throw new ModelException($"Convert data {oldValue} to {valueType.Name} failed."); + throw new ExtractionException($"Convert data {oldValue} to {valueType.Name} failed."); } } } diff --git a/src/DotnetSpider.Extraction/Model/SelectorUtil.cs b/src/DotnetSpider.Extraction/Model/SelectorExtensions.cs similarity index 88% rename from src/DotnetSpider.Extraction/Model/SelectorUtil.cs rename to src/DotnetSpider.Extraction/Model/SelectorExtensions.cs index d0e7b789b..0d871fe45 100644 --- a/src/DotnetSpider.Extraction/Model/SelectorUtil.cs +++ b/src/DotnetSpider.Extraction/Model/SelectorExtensions.cs @@ -2,7 +2,7 @@ namespace DotnetSpider.Extraction.Model { - public static class SelectorUtil + public static class SelectorExtensions { /// /// 把BaseSelector转换成真正的查询器 @@ -44,7 +44,7 @@ public static ISelector ToSelector(this Selector selector) { return Selectors.Regex(expression, group); } - throw new ModelException($"Regex argument should be a number set to group: {selector}."); + throw new ArgumentException($"Regex argument should be a number set to group: {selector}."); } } case SelectorType.XPath: @@ -54,7 +54,7 @@ public static ISelector ToSelector(this Selector selector) } default: { - throw new ModelException($"Selector {selector} unsupoort."); + throw new NotSupportedException($"{selector} unsupoort."); } } } diff --git a/src/DotnetSpider.Extraction/Model/TableNamePostfix.cs b/src/DotnetSpider.Extraction/Model/TableNamePostfix.cs new file mode 100644 index 000000000..d9ee4d070 --- /dev/null +++ b/src/DotnetSpider.Extraction/Model/TableNamePostfix.cs @@ -0,0 +1,33 @@ +namespace DotnetSpider.Extraction.Model +{ + public enum TableNamePostfix + { + None, + + /// + /// 表名的后缀为星期一的时间 + /// + Monday, + + /// + /// 表名的后缀为今天的时间 {name}_20171212 + /// + Today, + + /// + /// 表名的后缀为当月的第一天 {name}_20171201 + /// + FirstDayOfTheMonth, + + /// + /// 表名的后缀为当月 {name}_201712 + /// + Month, + + /// + /// 表名的后缀为上个月 {name}_201711 + /// + LastMonth + } + +} diff --git a/src/DotnetSpider.HtmlAgilityPack.Css/DotnetSpider.HtmlAgilityPack.Css.csproj b/src/DotnetSpider.HtmlAgilityPack.Css/DotnetSpider.HtmlAgilityPack.Css.csproj index 3bc025fed..080f508b8 100644 --- a/src/DotnetSpider.HtmlAgilityPack.Css/DotnetSpider.HtmlAgilityPack.Css.csproj +++ b/src/DotnetSpider.HtmlAgilityPack.Css/DotnetSpider.HtmlAgilityPack.Css.csproj @@ -3,7 +3,7 @@ net40;net45;netstandard2.0 true true - 3.0.0-beta1 + 3.0.0 zlzforever@163.com; DotnetSpider.HtmlAgilityPack.Css Copyright 2018 Lewis Zou diff --git a/src/DotnetSpider.Proxy/DotnetSpider.Proxy.csproj b/src/DotnetSpider.Proxy/DotnetSpider.Proxy.csproj index f18333946..fdc49e7e1 100644 --- a/src/DotnetSpider.Proxy/DotnetSpider.Proxy.csproj +++ b/src/DotnetSpider.Proxy/DotnetSpider.Proxy.csproj @@ -4,7 +4,7 @@ net40;net45;netstandard2.0 true true - 3.0.0-beta1 + 3.0.0 zlzforever@163.com; DotnetSpider.Proxy Copyright 2018 Lewis Zou diff --git a/src/DotnetSpider.Sample/TestSpider.cs b/src/DotnetSpider.Sample/TestSpider.cs index 5d0f41b20..4b2853802 100644 --- a/src/DotnetSpider.Sample/TestSpider.cs +++ b/src/DotnetSpider.Sample/TestSpider.cs @@ -16,7 +16,7 @@ public TestSpider() : base("TestSpider") { } - protected override void MyInit(params string[] arguments) + protected override void OnInit(params string[] arguments) { var word = "可乐|雪碧"; AddStartUrl(string.Format("http://news.baidu.com/ns?word={0}&tn=news&from=news&cl=2&pn=0&rn=20&ct=1", word), new Dictionary { { "Keyword", word } }); @@ -28,31 +28,31 @@ protected override void MyInit(params string[] arguments) [EntitySelector(Expression = ".//div[@class='result']", Type = SelectorType.XPath)] class BaiduSearchEntry : BaseEntity { - [Field(Expression = "Keyword", Type = SelectorType.Enviroment)] + [FieldSelector(Expression = "Keyword", Type = SelectorType.Enviroment)] public string Keyword { get; set; } - [Field(Expression = ".//h3[@class='c-title']/a")] + [FieldSelector(Expression = ".//h3[@class='c-title']/a")] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = "", OldValue = "")] public string Title { get; set; } - [Field(Expression = ".//h3[@class='c-title']/a/@href")] + [FieldSelector(Expression = ".//h3[@class='c-title']/a/@href")] public string Url { get; set; } - [Field(Expression = ".//div/p[@class='c-author']/text()")] + [FieldSelector(Expression = ".//div/p[@class='c-author']/text()")] [ReplaceFormatter(NewValue = "-", OldValue = " ")] public string Website { get; set; } - [Field(Expression = ".//div/span/a[@class='c-cache']/@href", Length = 0)] + [FieldSelector(Expression = ".//div/span/a[@class='c-cache']/@href", Length = 0)] public string Snapshot { get; set; } - [Field(Expression = ".//div[@class='c-summary c-row ']", Option = FieldOptions.InnerText, Length = 0)] + [FieldSelector(Expression = ".//div[@class='c-summary c-row ']", Option = FieldOptions.InnerText, Length = 0)] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = " ", OldValue = " ")] public string Details { get; set; } - [Field(Expression = ".", Option = FieldOptions.InnerText, Length = 0)] + [FieldSelector(Expression = ".", Option = FieldOptions.InnerText, Length = 0)] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = " ", OldValue = " ")] diff --git a/src/DotnetSpider.Sample/docs/AfterDownloadCompleteHandlerSpider.cs b/src/DotnetSpider.Sample/docs/AfterDownloadCompleteHandlerSpider.cs index 1c995c8cc..6cf270e04 100644 --- a/src/DotnetSpider.Sample/docs/AfterDownloadCompleteHandlerSpider.cs +++ b/src/DotnetSpider.Sample/docs/AfterDownloadCompleteHandlerSpider.cs @@ -25,7 +25,7 @@ public SinaNewsSpider() : base(new Site()) { } - protected override void MyInit(params string[] arguments) + protected override void OnInit(params string[] arguments) { AddStartUrl($"http://api.search.sina.com.cn/?c=news&t=&q=赵丽颖&pf=2136012948&ps=2130770082&page=0&stime={DateTime.Now.AddYears(-7).AddDays(-1).ToString("yyyy-MM-dd")}&etime={DateTime.Now.AddDays(1).ToString("yyyy-MM-dd")}&sort=rel&highlight=1&num=10&ie=utf-8&callback=jQuery1720001955628746606708_1508996230766&_=1508996681484", new Dictionary { { "keyword", "赵丽颖" } }); AddPipeline(new ConsoleEntityPipeline()); @@ -102,25 +102,25 @@ private string Zxj_ReplaceHtml(string patrn, string strRep, string content) [EntitySelector(Expression = "$.result.list[*]", Type = SelectorType.JsonPath)] class SinaNews : BaseEntity { - [Field(Expression = "$.origin_title", Type = SelectorType.JsonPath, Length = 80, Option = FieldOptions.InnerText)] + [FieldSelector(Expression = "$.origin_title", Type = SelectorType.JsonPath, Length = 80, Option = FieldOptions.InnerText)] public string Title { get; set; } - [Field(Expression = "$.url", Type = SelectorType.JsonPath, Length = 230)] + [FieldSelector(Expression = "$.url", Type = SelectorType.JsonPath, Length = 230)] public string Link { get; set; } - [Field(Expression = "keyword", Type = SelectorType.Enviroment, Length = 20)] + [FieldSelector(Expression = "keyword", Type = SelectorType.Enviroment, Length = 20)] public string Keywords { get; set; } - [Field(Expression = "$.intro", Type = SelectorType.JsonPath, Length = 300, Option = FieldOptions.InnerText)] + [FieldSelector(Expression = "$.intro", Type = SelectorType.JsonPath, Length = 300, Option = FieldOptions.InnerText)] public string Summary { get; set; } - [Field(Expression = "$.media", Type = SelectorType.JsonPath, Length = 20)] + [FieldSelector(Expression = "$.media", Type = SelectorType.JsonPath, Length = 20)] public string NewsFrom { get; set; } - [Field(Expression = "$.datetime", Type = SelectorType.JsonPath, Length = 20)] + [FieldSelector(Expression = "$.datetime", Type = SelectorType.JsonPath, Length = 20)] public string PublishTime { get; set; } - [Field(Expression = "$.cid", Type = SelectorType.JsonPath, Length = 20)] + [FieldSelector(Expression = "$.cid", Type = SelectorType.JsonPath, Length = 20)] public string Cid { get; set; } } } diff --git a/src/DotnetSpider.Sample/docs/AutoIncrementTargetRequestExtractorrSpider.cs b/src/DotnetSpider.Sample/docs/AutoIncrementTargetRequestExtractorrSpider.cs index d43fe4153..a4b8f8c17 100644 --- a/src/DotnetSpider.Sample/docs/AutoIncrementTargetRequestExtractorrSpider.cs +++ b/src/DotnetSpider.Sample/docs/AutoIncrementTargetRequestExtractorrSpider.cs @@ -22,7 +22,7 @@ public CnblogsSpider() : base(new Site()) { } - protected override void MyInit(params string[] arguments) + protected override void OnInit(params string[] arguments) { Identity = ("cnblogs_" + DateTime.Now.ToString("yyyy_MM_dd_HHmmss")); AddStartUrl("https://news.cnblogs.com/n/page/1"); @@ -34,10 +34,10 @@ protected override void MyInit(params string[] arguments) [TableInfo("cnblogs", "news")] class News : BaseEntity { - [Field(Expression = ".//h2[@class='news_entry']")] + [FieldSelector(Expression = ".//h2[@class='news_entry']")] public string Name { get; set; } - [Field(Expression = ".//span[@class='view']")] + [FieldSelector(Expression = ".//span[@class='view']")] public string View { get; set; } } } diff --git a/src/DotnetSpider.Sample/docs/CookiesSpider.cs b/src/DotnetSpider.Sample/docs/CookiesSpider.cs index d49fda2f3..d9b6c692d 100644 --- a/src/DotnetSpider.Sample/docs/CookiesSpider.cs +++ b/src/DotnetSpider.Sample/docs/CookiesSpider.cs @@ -1,6 +1,7 @@ using DotnetSpider.Common; using DotnetSpider.Extension; using DotnetSpider.Extension.Pipeline; +using DotnetSpider.Extraction.Model; using DotnetSpider.Extraction.Model.Attribute; using DotnetSpider.Extraction.Model.Formatter; using System.Collections.Generic; @@ -22,7 +23,7 @@ public CookiesSpider() : base(new Site { } - protected override void MyInit(params string[] arguments) + protected override void OnInit(params string[] arguments) { Downloader.AddCookies("sid=dea284fc36c24e8cbcd447343d7b8a4e; sn=DD962248; ctid=000000; ctnm=%E5%8F%A4%E9%95%87%E7%81%AF%E9%A5%B0%E6%89%B9%E5%8F%91; ctpv=%E5%B9%BF%E4%B8%9C; JSESSIONID=acbBqFfOD4I63d9PziDvv; DDENG=c4fc08ae2e3ba3efeddbc667c2f45e615a85e80009169501dc244a03e87908aa61146548b97ed9c7dc07af23bfd80bff5008f8c8867a9165d4bd2732aca0db7dedae2e042d3968fcad1150f36be242e8a32a3f59db2a0b39216a59f1628508c5799644532a9d99925f9841b3c13a1f97; userId=10003379; previousUser=%E5%A4%95%E7%8E%89; Hm_lvt_9e33f153f28be198970d205d90a24f28=1466146335; Hm_lpvt_9e33f153f28be198970d205d90a24f28=1466146392; Hm_lvt_54b4cb498afd05463ab4611b38a6f289=1466146335; Hm_lpvt_54b4cb498afd05463ab4611b38a6f289=1466146392; CNZZDATA1256982382=395301521-1466143554-%7C1466143554", "www.ddeng.com"); @@ -35,7 +36,7 @@ protected override void MyInit(params string[] arguments) [TableInfo("test", "ddeng_corp", TableNamePostfix.Today)] class Corp { - [Field(Expression = "/html/body/div[4]/div[2]/div[3]/div[1]/p[1]/strong", Length = 100)] + [FieldSelector(Expression = "/html/body/div[4]/div[2]/div[3]/div[1]/p[1]/strong", Length = 100)] public string Name { get; set; } [ReplaceFormatter(NewValue = "", OldValue = "\r")] @@ -44,7 +45,7 @@ class Corp [ReplaceFormatter(NewValue = "", OldValue = "\n")] [ReplaceFormatter(NewValue = "", OldValue = "\"")] [ReplaceFormatter(NewValue = "", OldValue = " ")] - [Field(Expression = "/html/body/div[4]/div[2]/div[3]/div[1]/ul/li[2]/div", Option = FieldOptions.InnerText, Length = 100)] + [FieldSelector(Expression = "/html/body/div[4]/div[2]/div[3]/div[1]/ul/li[2]/div", Option = FieldOptions.InnerText, Length = 100)] public string Phone { get; set; } [ReplaceFormatter(NewValue = "", OldValue = "\r")] @@ -54,10 +55,10 @@ class Corp [ReplaceFormatter(NewValue = "", OldValue = "\"")] [ReplaceFormatter(NewValue = "", OldValue = " ")] [ReplaceFormatter(NewValue = "", OldValue = "地址:")] - [Field(Expression = "/html/body/div[4]/div[2]/div[3]/div[1]/ul/li[3]", Option = FieldOptions.InnerText, Length = 200)] + [FieldSelector(Expression = "/html/body/div[4]/div[2]/div[3]/div[1]/ul/li[3]", Option = FieldOptions.InnerText, Length = 200)] public string Address { get; set; } - [Field(Expression = ".")] + [FieldSelector(Expression = ".")] public string Html { get; set; } } } diff --git a/src/DotnetSpider.Sample/docs/CtripCitySpider.cs b/src/DotnetSpider.Sample/docs/CtripCitySpider.cs index 66a80d1ae..569b8c746 100644 --- a/src/DotnetSpider.Sample/docs/CtripCitySpider.cs +++ b/src/DotnetSpider.Sample/docs/CtripCitySpider.cs @@ -25,7 +25,7 @@ public CtripCitySpider() : base(new Site { } - protected override void MyInit(params string[] arguments) + protected override void OnInit(params string[] arguments) { AddStartUrl("http://www.ctrip.com/"); AddEntityType(); @@ -36,16 +36,16 @@ protected override void MyInit(params string[] arguments) [EntitySelector(Expression = "//div[@class='city_item']//a")] class CtripCity { - [Field(Expression = ".", Length = 100)] + [FieldSelector(Expression = ".", Length = 100)] public string name { get; set; } - [Field(Expression = "./@title", Length = 100)] + [FieldSelector(Expression = "./@title", Length = 100)] public string title { get; set; } - [Field(Expression = "./@data-id", Length = 100)] + [FieldSelector(Expression = "./@data-id", Length = 100)] public string city_id { get; set; } - [Field(Expression = "Today", Type = SelectorType.Enviroment)] + [FieldSelector(Expression = "Today", Type = SelectorType.Enviroment)] public DateTime run_id { get; set; } } } diff --git a/src/DotnetSpider.Sample/docs/CustmizeProcessorAndPipelineSpider.cs b/src/DotnetSpider.Sample/docs/CustmizeProcessorAndPipelineSpider.cs index d81fcfbf1..8cf80d128 100644 --- a/src/DotnetSpider.Sample/docs/CustmizeProcessorAndPipelineSpider.cs +++ b/src/DotnetSpider.Sample/docs/CustmizeProcessorAndPipelineSpider.cs @@ -37,7 +37,7 @@ private class YoukuPipeline : BasePipeline { private long _count = 0; - public override void Process(IEnumerable resultItems, ILogger logger, dynamic sender = null) + public override void Process(IList resultItems, ILogger logger, dynamic sender = null) { foreach (var resultItem in resultItems) { diff --git a/src/DotnetSpider.Sample/docs/CustomizeFormatterSpider.cs b/src/DotnetSpider.Sample/docs/CustomizeFormatterSpider.cs index 12d9f43d6..934326a55 100644 --- a/src/DotnetSpider.Sample/docs/CustomizeFormatterSpider.cs +++ b/src/DotnetSpider.Sample/docs/CustomizeFormatterSpider.cs @@ -23,7 +23,7 @@ public static void Run() [TaskName("baidu_search")] class BaiduSearchSpider : EntitySpider { - protected override void MyInit(params string[] arguments) + protected override void OnInit(params string[] arguments) { EmptySleepTime = 1000; var word = "可乐|雪碧"; @@ -36,18 +36,18 @@ protected override void MyInit(params string[] arguments) [EntitySelector(Expression = ".//div[@class='result']", Type = SelectorType.XPath)] class Result : BaseEntity { - [Field(Expression = "Keyword", Type = SelectorType.Enviroment)] + [FieldSelector(Expression = "Keyword", Type = SelectorType.Enviroment)] public string Keyword { get; set; } - [Field(Expression = ".//h3[@class='c-title']/a")] + [FieldSelector(Expression = ".//h3[@class='c-title']/a")] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = "", OldValue = "")] public string Title { get; set; } - [Field(Expression = ".//h3[@class='c-title']/a/@href")] + [FieldSelector(Expression = ".//h3[@class='c-title']/a/@href")] public string Url { get; set; } - [Field(Expression = ".//div/p[@class='c-author']/text()")] + [FieldSelector(Expression = ".//div/p[@class='c-author']/text()")] [NullFormatter] public string Website { get; set; } } diff --git a/src/DotnetSpider.Sample/docs/CustomizeInitSpider.cs b/src/DotnetSpider.Sample/docs/CustomizeInitSpider.cs index 80a589323..5189f9773 100644 --- a/src/DotnetSpider.Sample/docs/CustomizeInitSpider.cs +++ b/src/DotnetSpider.Sample/docs/CustomizeInitSpider.cs @@ -7,7 +7,7 @@ // [TaskName("CustomizeInitSpider")] // public class CustomizeInitSpider : CommonSpider // { -// protected override void MyInit(params string[] arguments) +// protected override void OnInit(params string[] arguments) // { // Identity = Guid.NewGuid().ToString("N"); // } diff --git a/src/DotnetSpider.Sample/docs/DataHandlerSpider.cs b/src/DotnetSpider.Sample/docs/DataHandlerSpider.cs index ec2b961f5..2ea5a52fd 100644 --- a/src/DotnetSpider.Sample/docs/DataHandlerSpider.cs +++ b/src/DotnetSpider.Sample/docs/DataHandlerSpider.cs @@ -101,7 +101,7 @@ public Spider() : base(new Site { } - protected override void MyInit(params string[] arguments) + protected override void OnInit(params string[] arguments) { Downloader.AddAfterDownloadCompleteHandler(new CutoutHandler("g_page_config = {", "g_srp_loadCss();", 16, 22)); AddBeforeProcessor(new TargetRequestHandler(new AutoIncrementTargetRequestExtractor("&s=0", 44))); @@ -116,22 +116,22 @@ protected override void MyInit(params string[] arguments) [EntitySelector(Expression = "$.mods.itemlist.data.auctions[*]", Type = SelectorType.JsonPath)] private class TaobaoItem { - [Field(Expression = "$.view_price", Type = SelectorType.JsonPath, Length = 50)] + [FieldSelector(Expression = "$.view_price", Type = SelectorType.JsonPath, Length = 50)] public string price { get; set; } - [Field(Expression = "$.category", Type = SelectorType.JsonPath, Length = 20)] + [FieldSelector(Expression = "$.category", Type = SelectorType.JsonPath, Length = 20)] public string cat { get; set; } - [Field(Expression = "$.view_sales", Type = SelectorType.JsonPath, Length = 50)] + [FieldSelector(Expression = "$.view_sales", Type = SelectorType.JsonPath, Length = 50)] [ReplaceFormatter(NewValue = "", OldValue = "付款")] [ReplaceFormatter(NewValue = "", OldValue = "收货")] [ReplaceFormatter(NewValue = "", OldValue = "人")] public string sold { get; set; } - [Field(Expression = "$.nid", Type = SelectorType.JsonPath, Length = 50)] + [FieldSelector(Expression = "$.nid", Type = SelectorType.JsonPath, Length = 50)] public string item_id { get; set; } - [Field(Expression = "$.user_id", Type = SelectorType.JsonPath, Length = 50)] + [FieldSelector(Expression = "$.user_id", Type = SelectorType.JsonPath, Length = 50)] public string user_id { get; set; } } } diff --git a/src/DotnetSpider.Sample/docs/DbRequestBuilderSpider.cs b/src/DotnetSpider.Sample/docs/DbRequestBuilderSpider.cs index 2a46fcbec..e8409a0ee 100644 --- a/src/DotnetSpider.Sample/docs/DbRequestBuilderSpider.cs +++ b/src/DotnetSpider.Sample/docs/DbRequestBuilderSpider.cs @@ -4,13 +4,14 @@ using DotnetSpider.Extension; using DotnetSpider.Extension.Pipeline; using DotnetSpider.Extraction; +using DotnetSpider.Extraction.Model; using DotnetSpider.Extraction.Model.Attribute; namespace DotnetSpider.Sample.docs { public class DbRequestBuilderSpider : EntitySpider { - protected override void MyInit(params string[] arguments) + protected override void OnInit(params string[] arguments) { Downloader.AddAfterDownloadCompleteHandler(new CutoutHandler("json(", ");", 5, 0)); AddPipeline(new ConsoleEntityPipeline()); @@ -25,10 +26,10 @@ protected override void MyInit(params string[] arguments) [EntitySelector(Expression = "$.[*]", Type = SelectorType.JsonPath)] class Item { - [Field(Expression = "$.pid", Type = SelectorType.JsonPath, Length = 25)] + [FieldSelector(Expression = "$.pid", Type = SelectorType.JsonPath, Length = 25)] public string Sku { get; set; } - [Field(Expression = "$.shopId", Type = SelectorType.JsonPath)] + [FieldSelector(Expression = "$.shopId", Type = SelectorType.JsonPath)] public int ShopId { get; set; } } } diff --git a/src/DotnetSpider.Sample/docs/DefaultMySqlPipelineSpider.cs b/src/DotnetSpider.Sample/docs/DefaultMySqlPipelineSpider.cs index b0dbdecc3..2c45241b4 100644 --- a/src/DotnetSpider.Sample/docs/DefaultMySqlPipelineSpider.cs +++ b/src/DotnetSpider.Sample/docs/DefaultMySqlPipelineSpider.cs @@ -7,13 +7,13 @@ namespace DotnetSpider.Sample.docs { - public class DefaultMySqlPipelineSpider : CustomizedSpider + public class DefaultMySqlPipelineSpider : Spider { public DefaultMySqlPipelineSpider() : base(new Site()) { } - protected override void MyInit(params string[] arguments) + protected override void OnInit(params string[] arguments) { var word = "可乐|雪碧"; AddPipeline(new DefaultMySqlPipeline(Env.DataConnectionString, "baidu", "mysql_baidu_search")); diff --git a/src/DotnetSpider.Sample/docs/EntityModelSpider.cs b/src/DotnetSpider.Sample/docs/EntityModelSpider.cs index ad9690ef0..e5decf22a 100644 --- a/src/DotnetSpider.Sample/docs/EntityModelSpider.cs +++ b/src/DotnetSpider.Sample/docs/EntityModelSpider.cs @@ -22,7 +22,7 @@ public static void Run() private class Spider : EntitySpider { - protected override void MyInit(params string[] arguments) + protected override void OnInit(params string[] arguments) { var word = "可乐|雪碧"; AddStartUrl(string.Format("http://news.baidu.com/ns?word={0}&tn=news&from=news&cl=2&pn=0&rn=20&ct=1", word), new Dictionary { { "Keyword", word } }); @@ -34,31 +34,31 @@ protected override void MyInit(params string[] arguments) [EntitySelector(Expression = ".//div[@class='result']", Type = SelectorType.XPath)] class BaiduSearchEntry : BaseEntity { - [Field(Expression = "Keyword", Type = SelectorType.Enviroment)] + [FieldSelector(Expression = "Keyword", Type = SelectorType.Enviroment)] public string Keyword { get; set; } - [Field(Expression = ".//h3[@class='c-title']/a")] + [FieldSelector(Expression = ".//h3[@class='c-title']/a")] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = "", OldValue = "")] public string Title { get; set; } - [Field(Expression = ".//h3[@class='c-title']/a/@href")] + [FieldSelector(Expression = ".//h3[@class='c-title']/a/@href")] public string Url { get; set; } - [Field(Expression = ".//div/p[@class='c-author']/text()")] + [FieldSelector(Expression = ".//div/p[@class='c-author']/text()")] [ReplaceFormatter(NewValue = "-", OldValue = " ")] public string Website { get; set; } - [Field(Expression = ".//div/span/a[@class='c-cache']/@href")] + [FieldSelector(Expression = ".//div/span/a[@class='c-cache']/@href")] public string Snapshot { get; set; } - [Field(Expression = ".//div[@class='c-summary c-row ']", Option = FieldOptions.InnerText)] + [FieldSelector(Expression = ".//div[@class='c-summary c-row ']", Option = FieldOptions.InnerText)] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = " ", OldValue = " ")] public string Details { get; set; } - [Field(Expression = ".", Option = FieldOptions.InnerText)] + [FieldSelector(Expression = ".", Option = FieldOptions.InnerText)] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = " ", OldValue = " ")] diff --git a/src/DotnetSpider.Sample/docs/ModelSpider.cs b/src/DotnetSpider.Sample/docs/ModelSpider.cs index d61334a3a..f5efcaefb 100644 --- a/src/DotnetSpider.Sample/docs/ModelSpider.cs +++ b/src/DotnetSpider.Sample/docs/ModelSpider.cs @@ -17,12 +17,12 @@ public static void Run() var selector = new Selector("//div[@class='yk-pack pack-film']"); var fields = new[] { - new Field(".//img[@class='quic']/@alt","name"), - new Field("index", "index", SelectorType.Enviroment, DataType.Int), - new Field("", "id", SelectorType.Enviroment, DataType.Int){ IsPrimary=true}, + new FieldSelector(".//img[@class='quic']/@alt","name"), + new FieldSelector("index", "index", SelectorType.Enviroment, DataType.Int), + new FieldSelector("", "id", SelectorType.Enviroment, DataType.Int){ IsPrimary=true}, }; - var TargetRequestSelector = new TargetRequestSelector("//ul[@class='yk-pages']"); - var model = new ModelDefine(selector, fields, table, TargetRequestSelector); + var targetRequestSelector = new TargetRequestSelector("//ul[@class='yk-pages']"); + var model = new ModelDefinition(selector, fields, table, targetRequestSelector); // Config encoding, header, cookie, proxy etc... 定义采集的 Site 对象, 设置 Header、Cookie、代理等 var site = new Site { EncodingName = "UTF-8" }; diff --git a/src/DotnetSpider.Sample/docs/MultiEntityModelSpider.cs b/src/DotnetSpider.Sample/docs/MultiEntityModelSpider.cs index 5e5a3beef..09d633db4 100644 --- a/src/DotnetSpider.Sample/docs/MultiEntityModelSpider.cs +++ b/src/DotnetSpider.Sample/docs/MultiEntityModelSpider.cs @@ -16,7 +16,7 @@ public static void Run() private class CnblogsSpider : EntitySpider { - protected override void MyInit(params string[] arguments) + protected override void OnInit(params string[] arguments) { Identity = ("cnblogs_" + DateTime.Now.ToString("yyyy_MM_dd_HHmmss")); AddStartUrl("http://www.cnblogs.com"); @@ -30,16 +30,16 @@ protected override void MyInit(params string[] arguments) [EntitySelector(Expression = "//div[@class='post_item']")] class News : BaseEntity { - [Field(Expression = ".//a[@class='titlelnk']")] + [FieldSelector(Expression = ".//a[@class='titlelnk']")] public string Name { get; set; } - [Field(Expression = ".//div[@class='post_item_foot']/a[1]")] + [FieldSelector(Expression = ".//div[@class='post_item_foot']/a[1]")] public string Author { get; set; } - [Field(Expression = ".//div[@class='post_item_foot']/text()")] + [FieldSelector(Expression = ".//div[@class='post_item_foot']/text()")] public string PublishTime { get; set; } - [Field(Expression = ".//a[@class='titlelnk']/@href")] + [FieldSelector(Expression = ".//a[@class='titlelnk']/@href")] public string Url { get; set; } } @@ -47,16 +47,16 @@ class News : BaseEntity [EntitySelector(Expression = "//div[@class='post_item']")] class BlogSumary : BaseEntity { - [Field(Expression = ".//a[@class='titlelnk']")] + [FieldSelector(Expression = ".//a[@class='titlelnk']")] public string Name { get; set; } - [Field(Expression = ".//div[@class='post_item_foot']/a[1]")] + [FieldSelector(Expression = ".//div[@class='post_item_foot']/a[1]")] public string Author { get; set; } - [Field(Expression = ".//div[@class='post_item_foot']/text()")] + [FieldSelector(Expression = ".//div[@class='post_item_foot']/text()")] public string PublishTime { get; set; } - [Field(Expression = ".//a[@class='titlelnk']/@href")] + [FieldSelector(Expression = ".//a[@class='titlelnk']/@href")] public string Url { get; set; } } } diff --git a/src/DotnetSpider.Sample/docs/MultiProcessorsSpider.cs b/src/DotnetSpider.Sample/docs/MultiProcessorsSpider.cs index 28cf0380d..bdf4bf86c 100644 --- a/src/DotnetSpider.Sample/docs/MultiProcessorsSpider.cs +++ b/src/DotnetSpider.Sample/docs/MultiProcessorsSpider.cs @@ -40,7 +40,7 @@ private class MyPipeline : BasePipeline private static long blogSumaryCount = 0; private static long newsCount = 0; - public override void Process(IEnumerable resultItems, ILogger logger, dynamic sender = null) + public override void Process(IList resultItems, ILogger logger, dynamic sender = null) { foreach (var resultItem in resultItems) { diff --git a/src/DotnetSpider.Sample/docs/MySqlEntityPipelineSpider.cs b/src/DotnetSpider.Sample/docs/MySqlEntityPipelineSpider.cs index 0f37bae0a..a586ccd2b 100644 --- a/src/DotnetSpider.Sample/docs/MySqlEntityPipelineSpider.cs +++ b/src/DotnetSpider.Sample/docs/MySqlEntityPipelineSpider.cs @@ -18,7 +18,7 @@ public static void Run() private class Spider : EntitySpider { - protected override void MyInit(params string[] arguments) + protected override void OnInit(params string[] arguments) { var word = "可乐|雪碧"; AddStartUrl(string.Format("http://news.baidu.com/ns?word={0}&tn=news&from=news&cl=2&pn=0&rn=20&ct=1", word), new Dictionary { { "Keyword", word } }); @@ -30,31 +30,31 @@ protected override void MyInit(params string[] arguments) [EntitySelector(Expression = ".//div[@class='result']", Type = SelectorType.XPath)] class Result : BaseEntity { - [Field(Expression = "Keyword", Type = SelectorType.Enviroment)] + [FieldSelector(Expression = "Keyword", Type = SelectorType.Enviroment)] public string Keyword { get; set; } - [Field(Expression = ".//h3[@class='c-title']/a")] + [FieldSelector(Expression = ".//h3[@class='c-title']/a")] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = "", OldValue = "")] public string Title { get; set; } - [Field(Expression = ".//h3[@class='c-title']/a/@href")] + [FieldSelector(Expression = ".//h3[@class='c-title']/a/@href")] public string Url { get; set; } - [Field(Expression = ".//div/p[@class='c-author']/text()")] + [FieldSelector(Expression = ".//div/p[@class='c-author']/text()")] [ReplaceFormatter(NewValue = "-", OldValue = " ")] public string Website { get; set; } - [Field(Expression = ".//div/span/a[@class='c-cache']/@href")] + [FieldSelector(Expression = ".//div/span/a[@class='c-cache']/@href")] public string Snapshot { get; set; } - [Field(Expression = ".//div[@class='c-summary c-row ']", Option = FieldOptions.InnerText)] + [FieldSelector(Expression = ".//div[@class='c-summary c-row ']", Option = FieldOptions.InnerText)] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = " ", OldValue = " ")] public string Details { get; set; } - [Field(Expression = ".", Option = FieldOptions.InnerText)] + [FieldSelector(Expression = ".", Option = FieldOptions.InnerText)] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = "", OldValue = "")] [ReplaceFormatter(NewValue = " ", OldValue = " ")] diff --git a/src/DotnetSpider.Sample/docs/OneForAllSpider.cs b/src/DotnetSpider.Sample/docs/OneForAllSpider.cs index 9e798272f..c82edfd01 100644 --- a/src/DotnetSpider.Sample/docs/OneForAllSpider.cs +++ b/src/DotnetSpider.Sample/docs/OneForAllSpider.cs @@ -16,7 +16,7 @@ public static void Run() class Spider : EntitySpider { - protected override void MyInit(params string[] arguments) + protected override void OnInit(params string[] arguments) { AddStartUrl("http://www.jd.com/allSort.aspx"); AddEntityType(); @@ -28,12 +28,12 @@ protected override void MyInit(params string[] arguments) [EntitySelector(Expression = ".//div[@class='items']//a")] class Category { - [Field(Expression = ".")] + [FieldSelector(Expression = ".")] public string CategoryName { get; set; } [ToNext(Extras = new[] { "CategoryName" })] [RegexAppendFormatter(Pattern = "http://list.jd.com/list.html\\?cat=[0-9]+", AppendValue = "&page=1&trans=1&JL=6_0_0")] - [Field(Expression = "./@href")] + [FieldSelector(Expression = "./@href")] public string Url { get; set; } } @@ -41,17 +41,17 @@ class Category [TargetRequestSelector(XPaths = new[] { "//span[@class=\"p-num\"]" }, Patterns = new[] { @"&page=[0-9]+&" })] class TmpProduct { - [Field(Expression = "CategoryName", Type = SelectorType.Enviroment, Length = 100)] + [FieldSelector(Expression = "CategoryName", Type = SelectorType.Enviroment, Length = 100)] public string CategoryName { get; set; } [ToNext(Extras = new[] { "CategoryName", "Sku", "Name", "Url" })] - [Field(Expression = "./div[@class='p-name']/a[1]/@href")] + [FieldSelector(Expression = "./div[@class='p-name']/a[1]/@href")] public string Url { get; set; } - [Field(Expression = ".//div[@class='p-name']/a/em", Length = 100)] + [FieldSelector(Expression = ".//div[@class='p-name']/a/em", Length = 100)] public string Name { get; set; } - [Field(Expression = "./@data-sku", Length = 100)] + [FieldSelector(Expression = "./@data-sku", Length = 100)] public string Sku { get; set; } } @@ -59,24 +59,24 @@ class TmpProduct [TableInfo("jd", "jd_product", Uniques = new[] { "Sku" }, Indexs = new[] { "Sku" })] class JdProduct { - [Field(Expression = "Name", Type = SelectorType.Enviroment, Length = 100)] + [FieldSelector(Expression = "Name", Type = SelectorType.Enviroment, Length = 100)] public string Name { get; set; } - [Field(Expression = "Sku", Type = SelectorType.Enviroment, Length = 100)] + [FieldSelector(Expression = "Sku", Type = SelectorType.Enviroment, Length = 100)] public string Sku { get; set; } - [Field(Expression = "Url", Type = SelectorType.Enviroment)] + [FieldSelector(Expression = "Url", Type = SelectorType.Enviroment)] public string Url { get; set; } - [Field(Expression = "CategoryName", Type = SelectorType.Enviroment, Length = 100)] + [FieldSelector(Expression = "CategoryName", Type = SelectorType.Enviroment, Length = 100)] public string CategoryName { get; set; } - [Field(Expression = ".//a[@class='name']", Length = 100)] + [FieldSelector(Expression = ".//a[@class='name']", Length = 100)] public string ShopName { get; set; } [StringFormater(Format = "http:{0}")] [Download] - [Field(Expression = "//*[@class='brand-logo']/a[1]/img[1]/@src", IgnoreStore = true)] + [FieldSelector(Expression = "//*[@class='brand-logo']/a[1]/img[1]/@src", IgnoreStore = true)] public string Logo { get; set; } } } diff --git a/src/DotnetSpider.Sample/docs/RegexSelectorSpider.cs b/src/DotnetSpider.Sample/docs/RegexSelectorSpider.cs index a89139ded..1a5309fc2 100644 --- a/src/DotnetSpider.Sample/docs/RegexSelectorSpider.cs +++ b/src/DotnetSpider.Sample/docs/RegexSelectorSpider.cs @@ -7,7 +7,7 @@ namespace DotnetSpider.Sample.docs { public class RegexSelectorSpider : EntitySpider { - protected override void MyInit(params string[] arguments) + protected override void OnInit(params string[] arguments) { AddStartUrl("http://www.cnblogs.com"); AddPipeline(new ConsoleEntityPipeline()); @@ -16,7 +16,7 @@ protected override void MyInit(params string[] arguments) class HomePage { - [Field(Expression = "", Type = SelectorType.Regex, Arguments = "1")] + [FieldSelector(Expression = "", Type = SelectorType.Regex, Arguments = "1")] public string Category { get; set; } } } diff --git a/src/DotnetSpider.Sample/docs/WebDriverDownloaderSpider.cs b/src/DotnetSpider.Sample/docs/WebDriverDownloaderSpider.cs index 93de5e7bb..c3429ef51 100644 --- a/src/DotnetSpider.Sample/docs/WebDriverDownloaderSpider.cs +++ b/src/DotnetSpider.Sample/docs/WebDriverDownloaderSpider.cs @@ -3,6 +3,7 @@ using DotnetSpider.Extension.Downloader; using DotnetSpider.Extension.Pipeline; using DotnetSpider.Extraction; +using DotnetSpider.Extraction.Model; using DotnetSpider.Extraction.Model.Attribute; using System.Collections.Generic; @@ -10,7 +11,7 @@ namespace DotnetSpider.Sample.docs { public class WebDriverDownloaderSpider : EntitySpider { - protected override void MyInit(params string[] arguments) + protected override void OnInit(params string[] arguments) { Downloader = new WebDriverDownloader(Browser.Chrome); AddStartUrl("http://list.jd.com/list.html?cat=9987,653,655&page=2&JL=6_0_0&ms=5#J_main", new Dictionary { { "name", "手机" }, { "cat3", "655" } }); @@ -23,31 +24,31 @@ protected override void MyInit(params string[] arguments) [EntitySelector(Expression = "//li[@class='gl-item']/div[contains(@class,'j-sku-item')]")] class Product { - [Field(Expression = "name", Type = SelectorType.Enviroment, Length = 20)] + [FieldSelector(Expression = "name", Type = SelectorType.Enviroment, Length = 20)] public string CategoryName { get; set; } - [Field(Expression = "cat3", Type = SelectorType.Enviroment, Length = 20)] + [FieldSelector(Expression = "cat3", Type = SelectorType.Enviroment, Length = 20)] public int CategoryId { get; set; } - [Field(Expression = "./div[1]/a/@href", Length = 20)] + [FieldSelector(Expression = "./div[1]/a/@href", Length = 20)] public string Url { get; set; } - [Field(Expression = "./@data-sku", Length = 20)] + [FieldSelector(Expression = "./@data-sku", Length = 20)] public string Sku { get; set; } - [Field(Expression = "./div[5]/strong/a", Length = 20)] + [FieldSelector(Expression = "./div[5]/strong/a", Length = 20)] public long CommentsCount { get; set; } - [Field(Expression = ".//div[@class='p-shop']/@data-shop_name", Length = 20)] + [FieldSelector(Expression = ".//div[@class='p-shop']/@data-shop_name", Length = 20)] public string ShopName { get; set; } - [Field(Expression = ".//div[@class='p-name']/a/em", Length = 20)] + [FieldSelector(Expression = ".//div[@class='p-name']/a/em", Length = 20)] public string Name { get; set; } - [Field(Expression = "./@venderid", Length = 20)] + [FieldSelector(Expression = "./@venderid", Length = 20)] public string VenderId { get; set; } - [Field(Expression = "./@jdzy_shop_id", Length = 20)] + [FieldSelector(Expression = "./@jdzy_shop_id", Length = 20)] public string JdzyShopId { get; set; } } }