From c6e032e2c667e48b9110f2b68a1b330f347ca392 Mon Sep 17 00:00:00 2001 From: Lewis Zou Date: Wed, 29 May 2024 11:24:47 +0800 Subject: [PATCH] Update README.md --- README.md | 103 +++++++++++++++++++++++++++--------------------------- 1 file changed, 52 insertions(+), 51 deletions(-) diff --git a/README.md b/README.md index f18f54a8..ddb1b646 100644 --- a/README.md +++ b/README.md @@ -75,24 +75,65 @@ https://github.com/dotnetcore/DotnetSpider/wiki [View complete Codes](https://github.com/zlzforever/DotnetSpider/blob/master/src/DotnetSpider.Sample/samples/EntitySpider.cs) ````csharp -public class EntitySpider : Spider +public class EntitySpider( + IOptions options, + DependenceServices services, + ILogger logger) + : Spider(options, services, logger) { - public EntitySpider(IOptions options, SpiderServices services, ILogger logger) : base( - options, services, logger) + public static async Task RunAsync() + { + var builder = Builder.CreateDefaultBuilder(options => + { + options.Speed = 1; + }); + builder.UseSerilog(); + builder.IgnoreServerCertificateError(); + await builder.Build().RunAsync(); + } + + public static async Task RunMySqlQueueAsync() + { + var builder = Builder.CreateDefaultBuilder(options => + { + options.Speed = 1; + }); + builder.UseSerilog(); + builder.IgnoreServerCertificateError(); + builder.UseMySqlQueueBfsScheduler((context, options) => + { + options.ConnectionString = context.Configuration["SchedulerConnectionString"]; + }); + await builder.Build().RunAsync(); + } + + protected override async Task InitializeAsync(CancellationToken stoppingToken = default) { + AddDataFlow>(); + AddDataFlow(GetDefaultStorage); + await AddRequestsAsync( + new Request( + "https://news.cnblogs.com/n/page/1", new Dictionary { { "网站", "博客园" } })); } - #region Nested type: CnblogsEntry + protected override SpiderId GenerateSpiderId() + { + return new(ObjectId.CreateId().ToString(), "博客园"); + } [Schema("cnblogs", "news")] [EntitySelector(Expression = ".//div[@class='news_block']", Type = SelectorType.XPath)] [GlobalValueSelector(Expression = ".//a[@class='current']", Name = "类别", Type = SelectorType.XPath)] - [FollowRequestSelector(XPaths = new[] - { - "//div[@class='pager']" - })] + [GlobalValueSelector(Expression = "//title", Name = "Title", Type = SelectorType.XPath)] + [FollowRequestSelector(Expressions = ["//div[@class='pager']"])] public class CnblogsEntry : EntityBase { + protected override void Configure() + { + HasIndex(x => x.Title); + HasIndex(x => new { x.WebSite, x.Guid }, true); + } + public int Id { get; set; } [Required] @@ -106,7 +147,7 @@ public class EntitySpider : Spider public string WebSite { get; set; } [StringLength(200)] - [ValueSelector(Expression = "//title")] + [ValueSelector(Expression = "Title", Type = SelectorType.Environment)] [ReplaceFormatter(NewValue = "", OldValue = " - 博客园")] public string Title { get; set; } @@ -121,55 +162,15 @@ public class EntitySpider : Spider public string Url { get; set; } [ValueSelector(Expression = ".//div[@class='entry_summary']")] + [TrimFormatter] public string PlainText { get; set; } [ValueSelector(Expression = "DATETIME", Type = SelectorType.Environment)] public DateTime CreationTime { get; set; } - - protected override void Configure() - { - HasIndex(x => x.Title); - HasIndex(x => new - { - x.WebSite, - x.Guid - }, true); - } - } - - #endregion - - public static async Task RunAsync() - { - var builder = Builder.CreateDefaultBuilder(); - builder.UseSerilog(); - await builder.Build() - .RunAsync(); - } - - protected override async Task InitializeAsync(CancellationToken stoppingToken) - { - AddDataFlow(new DataParser()); - AddDataFlow(GetDefaultStorage()); - await AddRequestsAsync(new Request("https://news.cnblogs.com/n/page/1/", new Dictionary - { - { - "网站", "博客园" - } - }), new Request("https://news.cnblogs.com/n/page/2/", new Dictionary - { - { - "网站", "博客园" - } - })); - } - - protected override (string Id, string Name) GetIdAndName() - { - return (ObjectId.NewId.ToString(), "博客园"); } } + ```` #### Distributed spider